diff --git a/.clang-format b/.clang-format index 04f2bbaf85b2c..a4de8e7be8e07 100644 --- a/.clang-format +++ b/.clang-format @@ -6,11 +6,11 @@ # The basic usage is, # clang-format -i -style=file PATH/TO/SOURCE/CODE # -# The -style=file implicit use ".clang-format" file located in one of -# parent directory. +# The -style=file implicit use ".clang-format" file located in one of +# parent directory. # The -i means inplace change. # -# The document of clang-format is +# The document of clang-format is # http://clang.llvm.org/docs/ClangFormat.html # http://clang.llvm.org/docs/ClangFormatStyleOptions.html --- @@ -20,7 +20,7 @@ IndentWidth: 2 TabWidth: 2 ContinuationIndentWidth: 4 AccessModifierOffset: -1 # The private/protected/public has no indent in class -Standard: Cpp11 +Standard: Cpp11 AllowAllParametersOfDeclarationOnNextLine: true BinPackParameters: false BinPackArguments: false diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 1fcb3dc4f521d..7b62f131b9587 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -53,7 +53,6 @@ python/paddle/base/compiler.py @XiaoguangHu01 @zhiqiu @Xreki @qili93 @Aurelius84 python/paddle/base/dygraph/layers.py @JiabinYang @phlrain python/paddle/base/framework.py @XiaoguangHu01 @zhiqiu @Xreki @qili93 @Aurelius84 python/paddle/base/__init__.py @phlrain @Aurelius84 @qili93 -python/paddle/base/parallel_executor.py @Xreki @zhhsplendid @Aurelius84 python/paddle/base/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py @Aurelius84 @phlrain python/paddle/base/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py @Aurelius84 @phlrain python/paddle/base/tests/unittests/white_list/check_shape_white_list.py @hong19860320 @Aurelius84 @phlrain diff --git a/CMakeLists.txt b/CMakeLists.txt index 0aa41a26d700e..f0b2fa79d362a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,9 @@ if(WITH_GPU AND WITH_ROCM) endif() if(WITH_GPU AND NOT APPLE) + if(WITH_PIP_CUDA_LIBRARIES AND CMAKE_SYSTEM_NAME STREQUAL "Windows") + add_definitions(-DPADDLE_WITH_PIP_CUDA_LIBRARIES) + endif() #(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") @@ -107,8 +110,8 @@ if(WITH_GPU AND NOT APPLE) CACHE BOOL "" FORCE) set(CMAKE_CUDA_FLAGS "--cudart shared") if(WITH_PIP_CUDA_LIBRARIES) - #(Note risemeup1): Flag 'WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA. - add_definitions(-DWITH_PIP_CUDA_LIBRARIES) + #(Note risemeup1): Flag 'PADDLE_WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA. + add_definitions(-DPADDLE_WITH_PIP_CUDA_LIBRARIES) endif() endif() enable_language(CUDA) diff --git a/cmake/PaddleConfig.cmake.in b/cmake/PaddleConfig.cmake.in index d32c23f6f6edd..e55038bb77c63 100644 --- a/cmake/PaddleConfig.cmake.in +++ b/cmake/PaddleConfig.cmake.in @@ -12,7 +12,7 @@ get_filename_component(PADDLE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_FILE}/../.." 
ABSOLUTE)

 # include directories
-set(PADDLE_INCLUDE_DIRS
+set(PADDLE_INCLUDE_DIRS
   ${PADDLE_INSTALL_PREFIX}/include
   ${PADDLE_INSTALL_PREFIX}/include/third_party
 )
diff --git a/cmake/make_resource.py b/cmake/make_resource.py
index ad8ee179d60c2..e80900da58777 100644
--- a/cmake/make_resource.py
+++ b/cmake/make_resource.py
@@ -24,7 +24,7 @@
     "const unsigned char "
     + var
     + "[] = {"
-    + ",".join(["0x%02x" % ord(c) for c in open(res).read()])
+    + ",".join([f"0x{ord(c):02x}" for c in open(res).read()])
     + ",0};\n"
     + "const unsigned "
     + var
diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc
index 42986fff0dbb1..54805f2c78f50 100644
--- a/paddle/cinn/ast_gen_ius/ast_gen.cc
+++ b/paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -131,6 +131,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
     } else {
       iter_values.push_back(axis_vars[i]);
     }
+    ir::TryElevateInt32ToInt64({ir::Expr(axis_vars[i]), shape[i]});
   }
   VLOG(4) << "iter_value.size() and block_vars.size() is "
           << iter_values.size() << " " << block_vars.size();
@@ -167,6 +168,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
     } else {
       reduce_iter_values.push_back(axis_vars[i]);
     }
+    ir::TryElevateInt32ToInt64({ir::Expr(axis_vars[i]), shape[i]});
   }
   VLOG(4) << "ast gen: reduce body is after replace 0" << reduce_body;
   for (int i = 0; i < reduce_axis.size(); ++i) {
@@ -227,6 +229,9 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
       ir::ScheduleBlock::Make(
           reduce_block_vars, {}, {}, tensor->name, reduce_body));
   for (int i = static_cast<int>(reduce_axis.size()) - 1; i >= 0; --i) {
+    ir::TryElevateInt32ToInt64({reduce_axis[i],
+                                reduce_axis[i]->lower_bound,
+                                reduce_axis[i]->upper_bound});
     reduce_body = ir::For::Make(reduce_axis[i],
                                 reduce_axis[i]->lower_bound,
                                 reduce_axis[i]->upper_bound,
diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
index 6f00ee34813d1..c51ba89806956 100644
--- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
+++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
@@ -32,7 +32,7 @@
 #include "paddle/cinn/lang/lower.h"
 #include "paddle/cinn/optim/optimize.h"
 #include "paddle/cinn/optim/transform_gpu_forloop.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
@@ -193,10 +193,14 @@ ir::LoweredFunc UpdateFuncWithNewBody(const cinn::common::Target& target,
 std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
   const ir::ScheduleBlockRealize* block_realize =
       block.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlockRealize"));
   const ir::ScheduleBlock* block_node =
       block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(block_node);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_node,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlock"));
   std::vector<ir::Expr> iter_values = block_realize->iter_values;
   std::vector<ir::Var> iter_vars = block_node->iter_vars;
@@ -218,10 +222,14 @@ std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
 std::string GetBlockName(const ir::Expr block) {
   const ir::ScheduleBlockRealize* block_realize =
       block.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlockRealize"));
   const ir::ScheduleBlock* block_node =
       block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(block_node);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_node,
phi::errors::InvalidArgument("The block is not a ScheduleBlock")); return block_node->name; } diff --git a/paddle/cinn/auto_schedule/auto_tuner.cc b/paddle/cinn/auto_schedule/auto_tuner.cc index d45dcc743e525..9524e1ed3048f 100644 --- a/paddle/cinn/auto_schedule/auto_tuner.cc +++ b/paddle/cinn/auto_schedule/auto_tuner.cc @@ -34,7 +34,7 @@ #include "paddle/cinn/hlir/framework/op.h" #include "paddle/cinn/hlir/framework/visualize_helper.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -144,9 +144,10 @@ void PrintResult(const TuningResult& result) { } TuningResult AutoTuner::Tune(const TuningOptions& options) { - CHECK_GT(options.num_tuning_rounds, 0) << "Invalid config"; - VLOG(3) << "Begin tuning with round num=" << options.num_tuning_rounds - << ", tasks size=" << tasks_.size(); + PADDLE_ENFORCE_GT(options.num_tuning_rounds, + 0, + phi::errors::InvalidArgument( + "The num_tuning_rounds should be greater than 0.")); TuningResult result; result.subgraphs.resize(tasks_.size()); diff --git a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc index a9074c76fa8cf..54396ecaa6e2e 100644 --- a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc +++ b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc @@ -24,7 +24,7 @@ #include "paddle/cinn/auto_schedule/search_space/search_state.h" #include "paddle/cinn/common/target.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -45,8 +45,10 @@ void ExprCostModel::Train(const std::vector& samples, const cinn::common::Target& target) { trained_times_.store(1); size_t total_size = samples.size(); - CHECK_EQ(total_size, labels.size()) - << "Samples must have same size as labels"; + PADDLE_ENFORCE_EQ( + total_size, + labels.size(), + phi::errors::InvalidArgument("Samples must have same size as labels")); std::vector> train_feature_numbers(total_size); FeatureExtractor extractor; for (size_t i = 0; i < total_size; ++i) { @@ -63,8 +65,10 @@ void ExprCostModel::Update(const std::vector& samples, const cinn::common::Target& target) { ++trained_times_; size_t total_size = samples.size(); - CHECK_EQ(total_size, labels.size()) - << "Samples must have same size as labels"; + PADDLE_ENFORCE_EQ( + total_size, + labels.size(), + phi::errors::InvalidArgument("Samples must have same size as labels")); std::vector> train_feature_numbers(total_size); FeatureExtractor extractor; for (size_t i = 0; i < total_size; ++i) { diff --git a/paddle/cinn/auto_schedule/database/database.cc b/paddle/cinn/auto_schedule/database/database.cc index 2036b44a83fef..ee8277b9dadd6 100644 --- a/paddle/cinn/auto_schedule/database/database.cc +++ b/paddle/cinn/auto_schedule/database/database.cc @@ -22,7 +22,7 @@ #include "paddle/cinn/auto_schedule/task/task_registry.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/schedule/schedule_desc.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -42,8 +42,10 @@ proto::TuningRecord TuningRecord::ToProto() const { Database::Database(int capacity_per_task) : capacity_per_task_(capacity_per_task) { - CHECK_GT(capacity_per_task_, 0) - << "capacity_per_task_ should be greater than 0"; + PADDLE_ENFORCE_GT(capacity_per_task_, + 0, + phi::errors::InvalidArgument( + "capacity_per_task_ should be greater than 0")); } std::unique_ptr Database::Make(const DatabaseConfig& config) { 
diff --git a/paddle/cinn/auto_schedule/measure/simple_builder.cc b/paddle/cinn/auto_schedule/measure/simple_builder.cc index 5be5b8528616f..0636cfc2b79fa 100644 --- a/paddle/cinn/auto_schedule/measure/simple_builder.cc +++ b/paddle/cinn/auto_schedule/measure/simple_builder.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/cinn/auto_schedule/measure/simple_builder.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -25,8 +25,10 @@ SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler) : graph_compiler_(graph_compiler) {} BuildResult SimpleBuilder::Build(const MeasureInput& input) { - CHECK_NE(graph_compiler_, static_cast(nullptr)) - << "empty handle to GraphCompiler"; + PADDLE_ENFORCE_NE( + graph_compiler_, + static_cast(nullptr), + phi::errors::InvalidArgument("empty handle to GraphCompiler")); CompilationContext& context = graph_compiler_->GetCompilationContext(); context.groups.emplace_back(input.task->subgraph); context.lowered_funcs.emplace_back(input.lowered_funcs); diff --git a/paddle/cinn/auto_schedule/measure/simple_runner.cc b/paddle/cinn/auto_schedule/measure/simple_runner.cc index 92dcc00693b5b..ec3929aff71ae 100644 --- a/paddle/cinn/auto_schedule/measure/simple_runner.cc +++ b/paddle/cinn/auto_schedule/measure/simple_runner.cc @@ -25,7 +25,7 @@ #include "paddle/cinn/hlir/framework/buffer.h" #include "paddle/cinn/hlir/framework/scope.h" #include "paddle/cinn/hlir/framework/tensor.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -76,8 +76,11 @@ static void PopulateRandomValue(const cinn::common::Type& type, std::generate_n( fmt_ptr, numel, [&engine, &dist]() { return dist(engine); }); } else { - CHECK_EQ(type.bytes(), 8) - << "Unsupported type: " << type << ", type.bytes = " << type.bytes(); + PADDLE_ENFORCE_EQ( + type.bytes(), + 8, + phi::errors::Unimplemented("Unsupported type, the type.bytes is %d", + type.bytes())); auto* fmt_ptr = reinterpret_cast(raw_ptr); std::uniform_int_distribution dist( std::numeric_limits::min(), @@ -127,7 +130,12 @@ static std::unordered_set ParamsNeedInitWithZero( std::vector param_idxs = kInitWithZeroParams.at(node->op()->name); const auto& inlinks = node->inlinks_in_order(); for (int param_idx : param_idxs) { - CHECK_GT(inlinks.size(), param_idx); + PADDLE_ENFORCE_GT(inlinks.size(), + param_idx, + phi::errors::InvalidArgument( + "The input size of the node is less than the " + "index of the parameter that needs to be " + "initialized to 0")); auto& edge = inlinks.at(param_idx); std::string param_name = edge->source()->as()->id(); @@ -141,7 +149,10 @@ static std::unordered_set ParamsNeedInitWithZero( } SimpleRunner::SimpleRunner(int repeat_times) : repeat_times_(repeat_times) { - CHECK_GT(repeat_times_, 0) << "repeat_times can't less than 0"; + PADDLE_ENFORCE_GT( + repeat_times_, + 0, + phi::errors::InvalidArgument("repeat_times should be greater than 0")); } // Prepare execution arguments of all instructions to run, a argument diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc index 2e3c4b0e21661..ffc8a0f21d903 100644 --- a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc +++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc @@ -18,7 +18,7 @@ #include "paddle/cinn/ir/ir_printer.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/schedule/schedule_desc.h" - +#include 
"paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -29,7 +29,10 @@ int ExtractNumThreads(const ir::IRSchedule& ir_schedule, if (step.type == "Bind" && step.attrs.find("thread_axis") != step.attrs.end() && absl::get(step.attrs.at("thread_axis")) == bind_axis) { - CHECK_EQ(step.inputs.at("loop").size(), 1); + PADDLE_ENFORCE_EQ(step.inputs.at("loop").size(), + 1, + phi::errors::InvalidArgument( + "The loop size of bind step should be 1")); return step.inputs.at("loop")[0].As()->extent.as_int32(); } } diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc index e59ba8b423293..523763942c64e 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc @@ -21,7 +21,7 @@ #include "paddle/cinn/ir/schedule_block_graph.h" #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -40,8 +40,11 @@ bool IsSpatialLoop(const ir::For* for_node) { const auto* schedule_block = block_realize->schedule_block.As(); CHECK(schedule_block) << "schedule_block field is not a ScheduleBlock"; - CHECK_EQ(block_realize->iter_values.size(), - schedule_block->iter_vars.size()); + PADDLE_ENFORCE_EQ( + block_realize->iter_values.size(), + schedule_block->iter_vars.size(), + phi::errors::InvalidArgument( + "The size of iter_values and iter_vars should be equal.")); for (int i = 0; i < block_realize->iter_values.size(); ++i) { const ir::Var& iter_var = schedule_block->iter_vars[i]; const ir::Expr& binding = block_realize->iter_values[i]; @@ -93,10 +96,16 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule, int max_blocks, int max_threads_per_block) { auto all_loops = ir_schedule->GetLoops(block_name); - CHECK_LE(num_loops_to_bind, all_loops.size()) - << "The number of loops to be bind is greater than size of all_loops"; - CHECK_GE(num_loops_to_bind, 0) - << "The number of loops to be bind should be greater than 0"; + PADDLE_ENFORCE_LE( + num_loops_to_bind, + all_loops.size(), + phi::errors::InvalidArgument( + "The number of loops to be bind is greater than size of all_loops")); + PADDLE_ENFORCE_GE( + num_loops_to_bind, + 0, + phi::errors::InvalidArgument( + "The number of loops to be bind should be greater than 0")); // check whether it is the case that threadIdx has been binded but blockIdx // not, the threadIdx can only be binded in the first loop after // num_loops_to_bind loops because we has excluded other cases in @@ -130,13 +139,19 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule, if (extent <= max_blocks * max_threads_per_block) { auto splits = ir_schedule->Split(fused_loop, {-1, max_threads_per_block}); - CHECK_EQ(splits.size(), 2); + PADDLE_ENFORCE_EQ( + splits.size(), + 2, + phi::errors::InvalidArgument("The size of splits should be 2.")); ir_schedule->Bind(splits[0], "blockIdx.x"); ir_schedule->Bind(splits[1], "threadIdx.x"); } else { auto splits = ir_schedule->Split(fused_loop, {-1, max_blocks, max_threads_per_block}); - CHECK_EQ(splits.size(), 3); + PADDLE_ENFORCE_EQ( + splits.size(), + 3, + phi::errors::InvalidArgument("The size of splits should be 3.")); ir_schedule->Reorder({splits[1], splits[2], splits[0]}); all_loops = ir_schedule->GetLoops(block_name); ir_schedule->Bind(all_loops[0], "blockIdx.x"); @@ -160,8 +175,11 @@ RuleApplyType AutoBind::Init(ir::IRSchedule* ir_schedule) { } void 
AutoBind::Apply(int index) { - CHECK_LT(index, applicable_schedule_blocks_.size()) - << "invalid apply index:" << index; + PADDLE_ENFORCE_LT( + index, + applicable_schedule_blocks_.size(), + phi::errors::InvalidArgument( + "The index should be less than size of applicable_schedule_blocks_")); auto applied_block = applicable_schedule_blocks_.at(index); auto all_loops = ir_schedule_->GetLoops(applied_block); BindGPUIndex(ir_schedule_, diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc index e52d91c125224..ef0dbef492a59 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc @@ -20,7 +20,7 @@ #include "paddle/cinn/common/target.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -28,16 +28,19 @@ AutoGenRule::AutoGenRule(const cinn::common::Target& target) : target_(&target) {} int AutoGenRule::NumberApplicable() const { - CHECK_GE(num_applicable_, 0) - << "Call " << GetRuleName() - << "::NumberApplicable() without initialization."; + PADDLE_ENFORCE_GE( + num_applicable_, + 0, + phi::errors::InvalidArgument( + "The num_applicable_ should be greater than or equal to 0.")); return num_applicable_; } void AutoGenRule::ApplyRandomly() { - CHECK_GT(num_applicable_, 0) - << "Call " << GetRuleName() - << "::ApplyRandomly() with NumberApplicable() == 0"; + PADDLE_ENFORCE_GT(num_applicable_, + 0, + phi::errors::InvalidArgument( + "The num_applicable_ should be greater than 0.")); int index = rand() % num_applicable_; // NOLINT return Apply(index); } diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc index c052d2995c8ad..a4ecd5036e2e7 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc @@ -22,7 +22,7 @@ #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -97,8 +97,9 @@ RuleApplyType AutoUnroll::Init(ir::IRSchedule* ir_schedule) { } void AutoUnroll::Apply(int index) { - CHECK_LT(index, applicable_schedule_blocks_.size()) - << "invalid apply index:" << index; + PADDLE_ENFORCE_LT(index, + applicable_schedule_blocks_.size(), + phi::errors::InvalidArgument("Index is out of range.")); auto applied_block = applicable_schedule_blocks_.at(index); int max_step = auto_unroll_options[std::rand() % auto_unroll_options.size()]; ir_schedule_->Annotate( diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h index 1bbc8da4497d6..759dbfa54d3a4 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h @@ -27,7 +27,7 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -103,8 +103,11 @@ class MultiLevelTiling : public AutoGenRule { // Sample num_split integers whose product equals extent template 
std::vector SampleTileSplit(T extent, int num_split) const { - CHECK_GT(num_split, 0) - << "num_split in SampleTileSplit must be greater than 0"; + PADDLE_ENFORCE_GT( + num_split, + 0, + phi::errors::InvalidArgument( + "num_split in SampleTileSplit must be greater than 0")); if (num_split == 1) { return {extent}; } diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc index 85bc207c84fc7..0053c87a81394 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc @@ -23,7 +23,7 @@ #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace auto_schedule { @@ -32,10 +32,16 @@ bool ReductionFactoring::CanApply(const std::string& block_name, ir::Expr block_expr = ir_schedule->GetBlock(block_name); ir::ScheduleBlockRealize* block_realize = block_expr.As(); - CHECK_NOTNULL(block_realize); + PADDLE_ENFORCE_NOT_NULL( + block_realize, + phi::errors::InvalidArgument( + "The block_expr should be a ScheduleBlockRealize.")); ir::ScheduleBlock* sch_block = block_realize->schedule_block.As(); - CHECK_NOTNULL(sch_block); + PADDLE_ENFORCE_NOT_NULL( + sch_block, + phi::errors::InvalidArgument( + "The schedule_block field is not a ScheduleBlock.")); AnalyzeScheduleBlockReadWriteBuffer(sch_block); // 1. The block must have write buffer @@ -135,7 +141,11 @@ void ReductionFactoring::Apply(const std::string& block_name, return; } // 3. Reorder if new_loop_order differs from the original order - CHECK_EQ(all_loops.size(), new_loop_order.size()); + PADDLE_ENFORCE_EQ( + all_loops.size(), + new_loop_order.size(), + phi::errors::InvalidArgument("The size of all_loops should be equal to " + "the size of new_loop_order.")); for (int i = 0; i < all_loops.size(); ++i) { if (all_loops[i].As()->loop_var->name != new_loop_order[i].As()->loop_var->name) { @@ -152,7 +162,11 @@ void ReductionFactoring::Apply(const std::string& block_name, for (int i = num_spatial_loops; i < all_loops.size(); ++i) { reduction_loop_indices.push_back(i); } - CHECK_EQ(reduction_loop_indices.size(), num_reduction_loops); + PADDLE_ENFORCE_EQ(reduction_loop_indices.size(), + num_reduction_loops, + phi::errors::InvalidArgument( + "The size of reduction_loop_indices should be equal " + "to num_reduction_loops.")); fused_reduce_loop = ir_schedule->Fuse(block_name, reduction_loop_indices); } else { all_loops = ir_schedule->GetLoops(block_name); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc index d56d97f83df60..fb327c130dbbf 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc @@ -23,8 +23,8 @@ #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h" #include "paddle/cinn/ir/ir_printer.h" +#include "paddle/common/enforce.h" #include "test/cpp/cinn/concrete_program_builder.h" - PD_DECLARE_bool(cinn_new_group_scheduler); namespace cinn { @@ -64,8 +64,13 @@ class TestReductionFactoring : public TestAutoGenRuleBase { // check const std::vector& blocks = ir_schedule.GetAllBlocks(); - CHECK_EQ(blocks.size(), 2UL); - CHECK_EQ(ir.str(), 
expected_ir);
+    PADDLE_ENFORCE_EQ(
+        blocks.size(),
+        2UL,
+        phi::errors::InvalidArgument("The size of blocks should be 2."));
+    PADDLE_ENFORCE_EQ(ir.str(),
+                      expected_ir,
+                      phi::errors::InvalidArgument("The ir is not correct."));
   }
 };
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
index 994027dba0ee4..66d25c65542d1 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-
 #include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/cinn.h"
@@ -29,6 +28,7 @@
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/hlir/framework/tensor.h"
 #include "paddle/cinn/optim/transform_gpu_forloop.h"
+#include "paddle/common/enforce.h"
 #ifdef CINN_WITH_CUDA
 #include
 #endif
@@ -89,8 +89,10 @@ std::string TestAutoGenRuleBase::GetIR(const ir::IRSchedule& schedule) {
 ir::Module TestAutoGenRuleBase::BuildIRModule(const ir::IRSchedule& schedule) {
   auto&& updated_bodys = schedule.GetModule().GetExprs();
-  CHECK_EQ(lowered_funcs_.size(), updated_bodys.size())
-      << "associated exprs size not equal";
+  PADDLE_ENFORCE_EQ(
+      lowered_funcs_.size(),
+      updated_bodys.size(),
+      phi::errors::InvalidArgument("Associated exprs size not equal"));
   ir::Module::Builder builder("test_builder", this->target_);
   for (int i = 0; i < lowered_funcs_.size(); ++i) {
@@ -175,10 +177,16 @@ void CheckResult(raw_func_type test_func,
                  const cinn::common::Target& target) {
   CHECK(input_names.size()) << "The number of inputs must be greater than 0.";
   CHECK(output_names.size()) << "The number of outputs must be greater than 0.";
-  CHECK_EQ(input_names.size(), input_shapes.size())
-      << "The quantity of input_names and input_shapes must be equal.";
-  CHECK_EQ(output_names.size(), output_shapes.size())
-      << "The quantity of output_names and output_shapes must be equal.";
+  PADDLE_ENFORCE_EQ(
+      input_names.size(),
+      input_shapes.size(),
+      phi::errors::InvalidArgument(
+          "The quantity of input_names and input_shapes must be equal."));
+  PADDLE_ENFORCE_EQ(
+      output_names.size(),
+      output_shapes.size(),
+      phi::errors::InvalidArgument(
+          "The quantity of output_names and output_shapes must be equal."));
   // Initialize data
   std::vector input_data_ptrs(input_names.size());
diff --git a/paddle/cinn/auto_schedule/search_space/block_sampler.cc b/paddle/cinn/auto_schedule/search_space/block_sampler.cc
index 93de31e6a5e36..38d3b7badd02a 100644
--- a/paddle/cinn/auto_schedule/search_space/block_sampler.cc
+++ b/paddle/cinn/auto_schedule/search_space/block_sampler.cc
@@ -17,7 +17,7 @@
 #include
 #include "paddle/cinn/ir/ir.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
@@ -27,7 +27,10 @@ std::unique_ptr<BlockSampler> BlockSampler::Make(
     const std::string& strategy,
     utils::LinearRandomEngine::StateType rand_seed,
     const std::vector<int>& weights) {
-  CHECK_GT(all_blocks.size(), 0) << "Empty block list";
+  PADDLE_ENFORCE_GT(
+      all_blocks.size(),
+      0,
+      phi::errors::InvalidArgument("The all_blocks should not be empty."));
   if (strategy == "traversal") {
     VLOG(6) << "Init TraversalBlockSampler with block num = "
             << all_blocks.size();
@@ -87,7 +90,11 @@ ProbabilisticBlockSampler::ProbabilisticBlockSampler(
   if (weights.empty()) {
     weights_.resize(all_blocks.size(), 1);
   } else {
-    CHECK_EQ(all_blocks.size(), weights_.size());
+    PADDLE_ENFORCE_EQ(
+        all_blocks.size(),
+        weights_.size(),
+        phi::errors::InvalidArgument(
+            "The size of all_blocks and weights should be equal."));
   }
   remains_ = all_blocks.size();
 }
diff --git a/paddle/cinn/auto_schedule/search_space/rule_sampler.cc b/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
index 3c0868d0748e5..bd8e818546a91 100644
--- a/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
+++ b/paddle/cinn/auto_schedule/search_space/rule_sampler.cc
@@ -16,7 +16,7 @@
 #include
 #include
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
@@ -26,7 +26,10 @@ std::unique_ptr<RuleSampler> RuleSampler::Make(
     const std::string& strategy,
     utils::LinearRandomEngine::StateType rand_seed,
     const std::vector<int>& weights) {
-  CHECK_GT(potential_rules.size(), 0) << "Empty rule list";
+  PADDLE_ENFORCE_GT(
+      potential_rules.size(),
+      0,
+      phi::errors::InvalidArgument("The potential_rules should not be empty."));
   if (strategy == "traversal") {
     return std::make_unique<TraversalRuleSampler>(potential_rules,
                                                   default_remove_policy);
@@ -64,7 +67,11 @@ ProbabilisticRuleSampler::ProbabilisticRuleSampler(
   if (weights.empty()) {
     weights_.resize(potential_rules.size(), 1);
   } else {
-    CHECK_EQ(potential_rules.size(), weights_.size());
+    PADDLE_ENFORCE_EQ(
+        potential_rules.size(),
+        weights_.size(),
+        phi::errors::InvalidArgument(
+            "The size of potential_rules should be the same as the size of "
+            "weights."));
   }
   remains_ = potential_rules.size();
 }
diff --git a/paddle/cinn/auto_schedule/search_space/search_space.cc b/paddle/cinn/auto_schedule/search_space/search_space.cc
index 650e1d572f831..a4f4db6472e1b 100644
--- a/paddle/cinn/auto_schedule/search_space/search_space.cc
+++ b/paddle/cinn/auto_schedule/search_space/search_space.cc
@@ -33,7 +33,7 @@
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/runtime/flags.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(auto_schedule_use_cost_model);
 namespace cinn {
@@ -109,7 +109,10 @@ SearchState SearchSpace::RandomScheduleMutate(const SearchState& state) {
   --iter;
   int sample_rule_index = iter->second;
-  CHECK_LT(sample_rule_index, ret->applicable_rules.size());
+  PADDLE_ENFORCE_LT(sample_rule_index,
+                    ret->applicable_rules.size(),
+                    phi::errors::InvalidArgument(
+                        "The sample_rule_index should be less than the size "
+                        "of ret->applicable_rules."));
   AutoGenRule* sample_rule = ret->applicable_rules.at(sample_rule_index);
   VLOG(7) << "Apply rule: " << sample_rule->GetRuleName()
           << " with index=" << sample_weighted_index - iter->first;
diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
index dcb6e1ca93914..6403283f18be1 100644
--- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
+++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
@@ -35,7 +35,7 @@
 #include "paddle/cinn/utils/multi_threading.h"
 #include "paddle/cinn/utils/sized_multi_set.h"
 #include "paddle/cinn/utils/string.h"
-
+#include "paddle/common/enforce.h"
 PD_DECLARE_bool(auto_schedule_use_cost_model);
 namespace cinn {
@@ -175,9 +175,11 @@ SearchState EvolutionarySearch::CrossOver(const SearchState& state1,
   std::vector<ir::Expr> mother_exprs =
       state2->ir_schedule.GetModule().GetExprs();
-  CHECK_EQ(father_exprs.size(), mother_exprs.size())
-      << "CrossOver ModuleExpr in EvolutionarySearch must have same number of "
-         "AST";
+  PADDLE_ENFORCE_EQ(father_exprs.size(),
+                    mother_exprs.size(),
+                    phi::errors::InvalidArgument(
+                        "CrossOver ModuleExpr in EvolutionarySearch must have "
"same number of AST")); for (size_t i = 0; i < father_exprs.size(); ++i) { if (utils::SampleUniformInt(0, 2, &rand_seed_) == 0) { @@ -200,10 +202,15 @@ SearchState EvolutionarySearch::CrossOver(const SearchState& state1, SearchState EvolutionarySearch::Mutate( const SearchState& state, utils::LinearRandomEngine::StateType* rand_seed) { - CHECK_GT(weighted_mutators_.size(), 0) - << "There is no mutate rule can be applied."; + PADDLE_ENFORCE_GT( + weighted_mutators_.size(), + 0, + phi::errors::InvalidArgument("There is no mutate rule can be applied.")); double accu_weight = (weighted_mutators_.rbegin())->first; - CHECK_GT(accu_weight, 0) << "The accumulate weight must be greater than 0."; + PADDLE_ENFORCE_GT(accu_weight, + 0, + phi::errors::InvalidArgument( + "The accumulate weight must be greater than 0.")); // sample a mutate rule double sample_weight = utils::SampleUniformDouble(0, accu_weight, rand_seed); auto sampled_iter = weighted_mutators_.upper_bound(sample_weight); diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc index 6a983d7f9aaac..7791cdf9f89d5 100644 --- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc +++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc @@ -30,8 +30,8 @@ #include "paddle/cinn/hlir/framework/op_lowering.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/common/enforce.h" #include "test/cpp/cinn/program_builder.h" - namespace cinn { namespace auto_schedule { @@ -159,7 +159,10 @@ TEST(EvolutionarySearch, Evolve) { auto tasks = CreateTasks( tests::OpBuilder("matmul").Build({{"X", {32, 32}}, {"Y", {32, 32}}}), target); - CHECK_EQ(tasks.size(), 1); + PADDLE_ENFORCE_EQ( + tasks.size(), + 1, + phi::errors::InvalidArgument("The size of tasks should be 1.")); ExprCostModel cost_model; std::vector cost_model_samples(1); std::vector cost_model_labels(1); @@ -206,7 +209,11 @@ TEST(EvolutionarySearch, Evolve) { VLOG(6) << "cost = " << s->predicted_cost; } VLOG(6) << "total_cost_next = " << total_cost_next; - CHECK_LE(total_cost_next, total_cost_pre); + PADDLE_ENFORCE_LE( + total_cost_next, + total_cost_pre, + phi::errors::InvalidArgument("The total cost should be less than or " + "equal to the previous one.")); std::swap(population_pre_ptr, population_next_ptr); } } diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc index 273cba4c4060e..a027dc9dd1ed5 100644 --- a/paddle/cinn/auto_schedule/task/task_optimizer.cc +++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc @@ -18,7 +18,6 @@ #include #include - #include "paddle/cinn/auto_schedule/analysis/analyze_ir.h" #include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h" #include "paddle/cinn/auto_schedule/measure/measure.h" @@ -34,6 +33,7 @@ #include "paddle/cinn/optim/transform_gpu_forloop.h" #include "paddle/cinn/runtime/flags.h" #include "paddle/cinn/utils/string.h" +#include "paddle/common/enforce.h" #ifdef CINN_WITH_CUDA #include @@ -223,9 +223,12 @@ bool IsWrappedByCustomCall(const TuneTask* task) { TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution( const TuningOptions& options) { - CHECK_EQ(options.num_measure_trials % options.num_samples_per_iteration, 0) - << "TuningOptions.num_measure_trials % " - "TuningOptions.num_samples_per_iteration must be 0."; + PADDLE_ENFORCE_EQ( + options.num_measure_trials % options.num_samples_per_iteration, + 
+      0,
+      phi::errors::InvalidArgument(
+          "TuningOptions.num_measure_trials % "
+          "TuningOptions.num_samples_per_iteration must be 0."));
   VLOG(4) << "Optimizing TuneTask with num_measure_trials:"
           << options.num_measure_trials
@@ -290,9 +293,11 @@ TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
           << measure_inputs.size();
   std::vector<MeasureResult> measure_outputs =
       schedule_measurer_->Measure(measure_inputs);
-  CHECK_EQ(measure_outputs.size(), states.size())
-      << "ScheduleMeasurer didn't output same number of MeasureOutput of "
-         "states in TaskOptimizer";
+  PADDLE_ENFORCE_EQ(measure_outputs.size(),
+                    states.size(),
+                    phi::errors::InvalidArgument(
+                        "ScheduleMeasurer didn't output same number of "
+                        "MeasureOutput of states in TaskOptimizer"));
   // record to database
   for (size_t i = 0; i < states.size(); ++i) {
     database_->AddRecord(TuningRecord(measure_inputs[i].task->serialized_key,
@@ -344,9 +349,11 @@ std::vector<SearchState> TaskOptimizer::SearchOneRound(
   for (size_t i = 0; i < states.size(); ++i) {
     std::vector<ir::Expr> best_exprs =
         states[i]->ir_schedule.GetModule().GetExprs();
-    CHECK_EQ(best_exprs.size(), task_->lowered_funcs.size())
-        << "RuntimeError: Expr size is not equal to LoweredFunc size in "
-           "TaskOptimizer";
+    PADDLE_ENFORCE_EQ(best_exprs.size(),
+                      task_->lowered_funcs.size(),
+                      phi::errors::InvalidArgument(
+                          "Expr size is not equal to LoweredFunc size in "
+                          "TaskOptimizer"));
     auto init_funcs = ir::ir_utils::IRCopy(task_->lowered_funcs);
     std::vector<ir::LoweredFunc> valid_funcs;
     for (size_t j = 0; j < best_exprs.size(); ++j) {
@@ -369,8 +376,11 @@
   }
   states.erase(states.begin() + valid_cnt, states.end());
-  CHECK_EQ(states.size(), measure_candidates->size())
-      << "result size of states not equal to measure_candidates";
+  PADDLE_ENFORCE_EQ(
+      states.size(),
+      measure_candidates->size(),
+      phi::errors::InvalidArgument(
+          "result size of states not equal to measure_candidates"));
   VLOG(4) << "EvolutionarySearch return size=" << states.size()
           << ", valid count=" << valid_cnt;
   VLOG(4) << JoinStatesDebugString("TaskOptimizer::SearchOneRound-Result",
diff --git a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
index a8961e45b980d..f59acbe612635 100644
--- a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
@@ -19,7 +19,7 @@
 #include "paddle/cinn/auto_schedule/task/tune_task.h"
 #include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
 #include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace auto_schedule {
@@ -27,7 +27,10 @@ std::unique_ptr<TaskScheduler> TaskScheduler::Make(
     const std::vector<TuneTask>& tasks,
     const Config& config,
     const std::string& strategy) {
-  CHECK_GT(tasks.size(), 0) << "Empty task list";
+  PADDLE_ENFORCE_GT(
+      tasks.size(),
+      0,
+      phi::errors::InvalidArgument("The size of tasks should be greater than 0."));
   if (strategy == "round_robin") {
     return std::make_unique<RoundRobin>(tasks, config);
   } else if (strategy == "efficiency_priority") {
diff --git a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
index 2966467b3eda6..c9f2630ac6e8a 100644
--- a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
+++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
@@ -32,8 +32,8 @@
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/runtime/flags.h"
 #include "paddle/cinn/utils/data_util.h"
"paddle/cinn/utils/data_util.h" +#include "paddle/common/enforce.h" #include "test/cpp/cinn/program_builder.h" - /* This test is used as a tool to evaluate or compare performance of 3 * schedules(no schedule, manual schedule, auto-schedule). One can specify which * schedules to be evaluated through `FLAGS_evaluate_knobs` and specify which @@ -355,7 +355,10 @@ TEST_F(PerformanceTester, Gather) { // paddle model test TEST_F(PerformanceTester, ResNet50) { - CHECK_NE(FLAGS_resnet50_model_dir, ""); + PADDLE_ENFORCE_NE(FLAGS_resnet50_model_dir, + "", + phi::errors::InvalidArgument( + "The FLAGS_resnet50_model's dir should not be empty.")); FLAGS_cinn_infer_model_version = 1.0; std::unordered_map> feeds = { {"inputs", {batch_size, 3, 224, 224}}}; diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc index 85443b02c0a8c..07dc8421de6cc 100644 --- a/paddle/cinn/backends/codegen_c.cc +++ b/paddle/cinn/backends/codegen_c.cc @@ -26,7 +26,7 @@ #include "paddle/cinn/runtime/cpu/thread_backend.h" #include "paddle/cinn/runtime/intrinsic.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" //! Root of the builtin code. PD_DECLARE_string(cinn_x86_builtin_code_root); @@ -205,7 +205,10 @@ void CodeGenC::Visit(const ir::For *op) { Expr num_task_var = Var("num_task"); IrPrinter::Visit((op->extent + num_task_var - 1) / num_task_var); str_ += ";\n"; - CHECK_EQ(min.as_int32(), 0); + PADDLE_ENFORCE_EQ( + min.as_int32(), + 0, + phi::errors::InvalidArgument("The min of the for loop should be 0")); auto task_id = Var("task_id"); auto n_per_task = Var("n_per_task"); min = task_id * n_per_task; @@ -370,7 +373,10 @@ void CodeGenC::PrintCallArgs(const ir::Call *op) { } void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) { - CHECK_EQ(op->read_args.size(), 2UL); + PADDLE_ENFORCE_EQ( + op->read_args.size(), + 2UL, + phi::errors::InvalidArgument("The number of read_args should be 2")); str_ += op->name; str_ += "("; PrintCastExpr("void*", op->read_args[0]); @@ -380,7 +386,10 @@ void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) { } void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) { - CHECK_EQ(op->read_args.size(), 1UL); + PADDLE_ENFORCE_EQ( + op->read_args.size(), + 1UL, + phi::errors::InvalidArgument("The number of read_args should be 1")); str_ += op->name; str_ += "("; str_ += "&("; @@ -390,7 +399,10 @@ void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) { } void CodeGenC::PrintCall_get_address(const ir::Call *op) { - CHECK_EQ(op->read_args.size(), 1UL); + PADDLE_ENFORCE_EQ( + op->read_args.size(), + 1UL, + phi::errors::InvalidArgument("The number of read_args should be 1")); CHECK(op->write_args.empty()); auto *read_var = op->read_args.front().as_var(); auto *read_buf = op->read_args.front().as_buffer(); @@ -409,7 +421,10 @@ void CodeGenC::PrintCall_get_address(const ir::Call *op) { void CodeGenC::PrintCall_pod_values_to_array(const ir::Call *op) { CHECK(!op->read_args.empty()); - CHECK_EQ(op->write_args.size(), 1UL); + PADDLE_ENFORCE_EQ( + op->write_args.size(), + 1UL, + phi::errors::InvalidArgument("The number of write_args should be 1")); auto output_var = op->write_args.front().as_var_ref(); CHECK(output_var.defined()); @@ -612,9 +627,12 @@ void CodeGenC::Visit(const ir::_LoweredFunc_ *op) { DoIndent(); - CHECK_EQ(op->alloc_output_buffer_exprs.size(), - op->dealloc_output_buffer_exprs.size()) - << "the count of allocation and deallocation expressions is not match"; + PADDLE_ENFORCE_EQ( + 
op->alloc_output_buffer_exprs.size(), + op->dealloc_output_buffer_exprs.size(), + phi::errors::InvalidArgument( + "The count of allocation and deallocation expressions is not " + "match")); std::vector new_body; diff --git a/paddle/cinn/backends/codegen_c_x86.cc b/paddle/cinn/backends/codegen_c_x86.cc index 394b61e35816d..06a9ff1fda2f9 100644 --- a/paddle/cinn/backends/codegen_c_x86.cc +++ b/paddle/cinn/backends/codegen_c_x86.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/cinn/backends/codegen_c_x86.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -53,7 +53,11 @@ void CodeGenCX86::Visit(const ir::Load *op) { } void CodeGenCX86::Visit(const ir::Broadcast *op) { - CHECK_GT(op->type().lanes(), 1); + PADDLE_ENFORCE_GT( + op->type().lanes(), + 1, + phi::errors::InvalidArgument( + "The lanes of the broadcast op should be greater than 1.")); int bits = op->type().bits() * op->type().lanes(); if (SupportsAVX512() && bits == 512) { diff --git a/paddle/cinn/backends/codegen_c_x86.h b/paddle/cinn/backends/codegen_c_x86.h index f0b040a94f1ae..bf90612292d20 100644 --- a/paddle/cinn/backends/codegen_c_x86.h +++ b/paddle/cinn/backends/codegen_c_x86.h @@ -18,7 +18,7 @@ #include "paddle/cinn/backends/codegen_c.h" #include "paddle/cinn/ir/intrinsic_ops.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -114,8 +114,10 @@ void CodeGenCX86::VisitBinaryOp(const Op *op, Expr a, Expr b, const std::string &op_repr) { - CHECK_EQ(a.type(), b.type()) << " a is : " << a << ", and b is : " << b - << ". op_repr is : " << op_repr; + PADDLE_ENFORCE_EQ( + a.type(), + b.type(), + phi::errors::InvalidArgument("The type of a and b should be the same.")); // scalar. if (a.type().lanes() == 1) { diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index 9c19c6faffb73..919edfc680ca7 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -26,8 +26,8 @@ #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/utils/ir_verify.h" #include "paddle/cinn/optim/ir_simplify.h" +#include "paddle/common/enforce.h" #include "paddle/common/errors.h" - namespace cinn { namespace backends { @@ -122,7 +122,8 @@ std::vector FilterDeallocTempBuffers(const std::vector &frees) { std::vector filtered; for (const Expr &free : frees) { const ir::Free *op = free.As(); - CHECK_NOTNULL(op); + PADDLE_ENFORCE_NOT_NULL( + op, phi::errors::InvalidArgument("Free is not a free node")); bool has_symbolic_constant = false; const ir::_Buffer_ *buffer = op->destination.As(); for (Expr shape : buffer->shape) { @@ -305,7 +306,10 @@ std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, void CodeGenCUDA_Dev::PrintIncludes() { str_ += GetSourceHeader(); } void CodeGenCUDA_Dev::PrintTempBufferCreation(const ir::Buffer &buffer) { - CHECK_NE(buffer->type(), Void()); + PADDLE_ENFORCE_NE( + buffer->type(), + Void(), + phi::errors::InvalidArgument("buffer type should not be void")); // Calculate buffer size and determine if it contains a symbolic constant Expr buffer_size(1); for (int i = 0; i < buffer->shape.size(); i++) { diff --git a/paddle/cinn/backends/codegen_cuda_host.cc b/paddle/cinn/backends/codegen_cuda_host.cc index b888db7c7c726..1ba4714153395 100644 --- a/paddle/cinn/backends/codegen_cuda_host.cc +++ b/paddle/cinn/backends/codegen_cuda_host.cc @@ -23,7 +23,7 @@ #include "paddle/cinn/backends/extern_func_jit_register.h" #include 
"paddle/cinn/backends/llvm/llvm_util.h" #include "paddle/cinn/runtime/intrinsic.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -65,10 +65,22 @@ llvm::Value* CodeGenCUDA_Host::LowerGPUKernelLauncher( llvm::Value* kernel_stream = nullptr; if (ll_function_args.size() == 3) { kernel_stream = ll_function_args[2]; - CHECK_EQ(kernel_stream->getType(), ll_void_p_ty()); // void* stream + PADDLE_ENFORCE_EQ( + kernel_stream->getType(), + ll_void_p_ty(), + phi::errors::InvalidArgument( + "The type of kernel_stream should be void*")); // void* stream } - CHECK_EQ(kernel_args->getType(), ll_void_p_ty()); // void* args - CHECK_EQ(kernel_args_count->getType(), ll_int32_ty()); // int32 + PADDLE_ENFORCE_EQ( + kernel_args->getType(), + ll_void_p_ty(), + phi::errors::InvalidArgument( + "The type of kernel_args should be void*")); // void* args + PADDLE_ENFORCE_EQ( + kernel_args_count->getType(), + ll_int32_ty(), + phi::errors::InvalidArgument( + "The type of kernel_args_count should be int32")); // int32 std::unordered_map global_args = { {KERNEL_ARGS, kernel_args}, @@ -199,7 +211,11 @@ llvm::Value* CodeGenCUDA_Host::LowerHostFunc(const ir::_LoweredFunc_* func) { // @} // Set local scope table - CHECK_EQ(ll_function_args.size(), func->args.size()); + PADDLE_ENFORCE_EQ(ll_function_args.size(), + func->args.size(), + phi::errors::InvalidArgument( + "The number of arguments is not equal to the number of " + "function arguments")); for (int i = 0; i < ll_function_args.size(); ++i) { SetVar(func->args[i].name(), ll_function_args[i]); } @@ -224,7 +240,11 @@ llvm::Value* CodeGenCUDA_Host::LowerParseArgsValueCall( const ir::Call* call_ir) { auto ret_type = CinnTypeToLLVMType(Int(64), m_); std::vector args_type; - CHECK_EQ(call_ir->read_args.size(), 2); + PADDLE_ENFORCE_EQ( + call_ir->read_args.size(), + 2, + phi::errors::InvalidArgument( + "The number of arguments of ParseArgsValue should be 2")); CHECK(call_ir->read_args[0].is_var() && call_ir->read_args[0].as_var()->type().is_cpp_handle()); CHECK(call_ir->read_args[1].type().is_int(32)); @@ -251,10 +271,22 @@ llvm::Value* CodeGenCUDA_Host::LowerCUDAKernelCall(const ir::Call* call_ir) { llvm::Value* kernel_stream = nullptr; if (ll_function_args.size() == 3) { kernel_stream = ll_function_args[2]; - CHECK_EQ(kernel_stream->getType(), ll_void_p_ty()); // void* stream + PADDLE_ENFORCE_EQ( + kernel_stream->getType(), + ll_void_p_ty(), + phi::errors::InvalidArgument( + "The type of kernel_stream should be void*")); // void* stream } - CHECK_EQ(kernel_args->getType(), ll_void_p_ty()); // void* args - CHECK_EQ(kernel_args_count->getType(), ll_int32_ty()); // int32 + PADDLE_ENFORCE_EQ( + kernel_args->getType(), + ll_void_p_ty(), + phi::errors::InvalidArgument( + "The type of kernel_args should be void*")); // void* args + PADDLE_ENFORCE_EQ( + kernel_args_count->getType(), + ll_int32_ty(), + phi::errors::InvalidArgument( + "The type of kernel_args_count should be int32")); // int32 std::unordered_map global_args = { {KERNEL_ARGS, kernel_args}, diff --git a/paddle/cinn/backends/codegen_device_util.cc b/paddle/cinn/backends/codegen_device_util.cc index 3373ed15e3bec..91c18ea35e9ea 100644 --- a/paddle/cinn/backends/codegen_device_util.cc +++ b/paddle/cinn/backends/codegen_device_util.cc @@ -68,6 +68,18 @@ std::string Predicate2String(ir::Expr predicate) { return ss.str(); } +static std::string CurTailFnName(const std::string &origin_fn_name) { + const int MaxStrLength = 16383; + if (origin_fn_name.length() <= MaxStrLength) { + return 
origin_fn_name;
+  }
+  VLOG(6) << "Function name too long. Curtail and concat hash.";
+  const std::string new_fn_name =
+      origin_fn_name.substr(0, MaxStrLength) +
+      std::to_string(std::hash<std::string>()(origin_fn_name));
+  return new_fn_name;
+}
+
 std::string
 detail::CollectBucketStrategyHostFunctionVisitor::GenDeviceKernelName(
     const std::string &fn_name, ir::Expr predicate) {
@@ -80,7 +92,10 @@
     pos = cond_str.find("-", pos + replacement.length());
   }
   VLOG(3) << "predicate string: " << cond_str;
-  return fn_name + "__COND_" + cond_str + "__kernel";
+  // NOTE(chenxi67): The kernel name is too long to be supported in cuda12.3 so
+  // we need to curtail it.
+  const std::string new_fn_name = CurTailFnName(fn_name);
+  return new_fn_name + "__COND_" + cond_str + "__kernel";
 }
 void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
diff --git a/paddle/cinn/backends/codegen_device_util.h b/paddle/cinn/backends/codegen_device_util.h
index caada3153e63b..ff3114c71296b 100644
--- a/paddle/cinn/backends/codegen_device_util.h
+++ b/paddle/cinn/backends/codegen_device_util.h
@@ -27,7 +27,7 @@
 #include "paddle/cinn/ir/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/runtime/flags.h"
-
+#include "paddle/common/enforce.h"
 namespace cinn {
 namespace backends {
@@ -205,7 +205,11 @@ struct CollectBucketStrategyHostFunctionVisitor
     if (op->functions.size() == 1 && op->predicates.size() == 0) {
       expr->as_module()->predicates.push_back(ir::Expr(true));
     }
-    CHECK_EQ(op->functions.size(), op->predicates.size());
+    PADDLE_ENFORCE_EQ(
+        op->functions.size(),
+        op->predicates.size(),
+        phi::errors::InvalidArgument(
+            "The size of functions and predicates should be equal"));
     for (int i = 0; i < op->functions.size(); ++i) {
       ProcessLoweredFunc(op->functions[i], op->predicates[i]);
       if (i == 0) {
diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc
index 4f02a35411413..72678eec44c22 100644
--- a/paddle/cinn/backends/compiler.cc
+++ b/paddle/cinn/backends/compiler.cc
@@ -230,15 +230,23 @@ void SourceCodePrint::write(const std::string& source_code) {
   }
 }
-void Compiler::Build(const Module& module, const std::string& code) {
-  auto PatternMatch =
-      adt::match{[&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
-                 [&](common::X86Arch) { CompileX86Module(module); },
-                 [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
-                 [&](common::NVGPUArch) { CompileCudaModule(module, code); }};
+void Compiler::Build(const Module& module,
+                     const std::string& code,
+                     const bool end) {
+  auto PatternMatch = adt::match{
+      [&](common::UnknownArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::X86Arch) { CompileX86Module(module, end); },
+      [&](common::ARMArch) { CINN_NOT_IMPLEMENTED; },
+      [&](common::NVGPUArch) { CompileCudaModule(module, code, end); }};
   return std::visit(PatternMatch, target_.arch.variant());
 }
+void Compiler::AppendCX86(const Module& module) {
+  VLOG(3) << "Start Compiler::BuildCX86" << module;
+  CompileX86Module(module, true);
+  VLOG(3) << "Over Compiler::BuildCX86";
+}
+
 std::string Compiler::GetSourceCode(const ir::Module& module) {
   return target_.arch.Visit(adt::match{
       [&](common::UnknownArch) -> std::string { CINN_NOT_IMPLEMENTED; },
@@ -287,7 +295,8 @@ std::string GetFileContent(const std::string& path) {
 }  // namespace
 void Compiler::CompileCudaModule(const Module& module,
-                                 const std::string& code) {
+                                 const std::string& code,
+                                 bool add_module) {
 #ifdef CINN_WITH_CUDA
   auto
_host_module_device_module_ = SplitDeviceAndHostModule(module); // NOLINT @@ -337,15 +346,15 @@ void Compiler::CompileCudaModule(const Module& module, } engine_ = ExecutionEngine::Create(ExecutionOptions(), std::move(symbols)); - engine_->Link(host_module); + engine_->Link(host_module, add_module); #else CINN_NOT_IMPLEMENTED #endif } -void Compiler::CompileX86Module(const Module& module) { - engine_->Link(module); +void Compiler::CompileX86Module(const Module& module, bool add_module) { + engine_->Link(module, add_module); } void Compiler::ExportObject(const std::string& path) { diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h index f269b00492a42..d43455cf76287 100644 --- a/paddle/cinn/backends/compiler.h +++ b/paddle/cinn/backends/compiler.h @@ -107,7 +107,10 @@ class Compiler final { /** * Compile and link to a CINN module. */ - void Build(const ir::Module& module, const std::string& code = ""); + void Build(const ir::Module& module, + const std::string& code = "", + const bool end = true); + void AppendCX86(const ir::Module& module); void ExportObject(const std::string& path); @@ -125,9 +128,10 @@ class Compiler final { private: void CompileCudaModule(const ir::Module& module, - const std::string& code = ""); + const std::string& code = "", + bool add_module = true); - void CompileX86Module(const ir::Module& module); + void CompileX86Module(const ir::Module& module, bool add_module = true); explicit Compiler(const Target& target) : target_(target), engine_(ExecutionEngine::Create(ExecutionOptions())) {} diff --git a/paddle/cinn/backends/function_prototype.cc b/paddle/cinn/backends/function_prototype.cc index e413521246b8f..e46b172bf65ed 100644 --- a/paddle/cinn/backends/function_prototype.cc +++ b/paddle/cinn/backends/function_prototype.cc @@ -20,7 +20,7 @@ #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/runtime/flags.h" - +#include "paddle/common/enforce.h" PD_DECLARE_bool(verbose_function_register); namespace cinn { @@ -42,13 +42,22 @@ bool FunctionProto::Match(const ir::Call *op) const { } void FunctionProto::AssertMatch(const ir::Call *op) const { - CHECK_EQ(name, op->name); - CHECK_EQ(ret_type, op->type()) - << "function proto " << name << " check failed"; - CHECK_EQ(op->read_args.size(), readonly_arg_types.size()) - << "function proto " << name << " check failed"; - CHECK_EQ(op->write_args.size(), mutable_arg_types.size()) - << "function proto " << name << " check failed"; + PADDLE_ENFORCE_EQ( + name, + op->name, + phi::errors::InvalidArgument("function proto's op name check failed")); + PADDLE_ENFORCE_EQ( + ret_type, + op->type(), + phi::errors::InvalidArgument("function proto's op type check failed")); + PADDLE_ENFORCE_EQ(op->read_args.size(), + readonly_arg_types.size(), + phi::errors::InvalidArgument( + "function proto's readonly arg types check failed")); + PADDLE_ENFORCE_EQ(op->write_args.size(), + mutable_arg_types.size(), + phi::errors::InvalidArgument( + "function proto's mutable arg types check failed")); auto get_type = [](Expr u) { if (u.as_tensor() || u.as_buffer()) { @@ -61,14 +70,21 @@ void FunctionProto::AssertMatch(const ir::Call *op) const { if (readonly_arg_types[i] == type_of()) { if (!op->read_args[i].as_tensor()) continue; } else { - CHECK_EQ(get_type(op->read_args[i]), readonly_arg_types[i]); + PADDLE_ENFORCE_EQ( + get_type(op->read_args[i]), + readonly_arg_types[i], + phi::errors::InvalidArgument( + "function proto's readonly arg types check failed")); } } for (int i = 0; i < op->write_args.size(); i++) { if 
(mutable_arg_types[i] == type_of()) { if (!op->write_args[i].as_tensor()) continue; } else { - CHECK_EQ(get_type(op->write_args[i]), mutable_arg_types[i]); + PADDLE_ENFORCE_EQ(get_type(op->write_args[i]), + mutable_arg_types[i], + phi::errors::InvalidArgument( + "function proto's mutable arg types check failed")); } } } @@ -86,7 +102,10 @@ void FunctionProto::CheckValid() { FunctionProto::shape_inference_t FunctionProto::ShapeFollowNthArgument(int n) { return [=](const std::vector &args, int value_offset) { - CHECK_LT(n, args.size()); + PADDLE_ENFORCE_LT( + n, + args.size(), + phi::errors::InvalidArgument("The argument index is out of range")); auto x = args[n].as_tensor(); CHECK(x); return x->shape; diff --git a/paddle/cinn/backends/ir_schedule_test.cc b/paddle/cinn/backends/ir_schedule_test.cc index 29eae201bbb78..7dd78ddb9cd86 100644 --- a/paddle/cinn/backends/ir_schedule_test.cc +++ b/paddle/cinn/backends/ir_schedule_test.cc @@ -31,7 +31,7 @@ #include "paddle/cinn/optim/remove_schedule_block.h" #include "paddle/cinn/optim/unroll_loops.h" #include "paddle/cinn/optim/vectorize_loops.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -563,7 +563,10 @@ TEST(IrSchedule, vectorize) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 2U); + PADDLE_ENFORCE_EQ( + loops.size(), + 2U, + phi::errors::InvalidArgument("The size of loops should be 2.")); ir_sch.Vectorize(loops[1], 16); std::string origin = utils::GetStreamCnt(func[0]); EXPECT_EQ(origin, utils::Trim(R"ROC( @@ -637,7 +640,10 @@ TEST(IrSchedule, unroll) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 2U); + PADDLE_ENFORCE_EQ( + loops.size(), + 2U, + phi::errors::InvalidArgument("The size of loops should be 2.")); ir_sch.Unroll(loops[1]); std::string origin = utils::GetStreamCnt(func[0]); EXPECT_EQ(origin, utils::Trim(R"ROC( @@ -711,7 +717,10 @@ TEST(IrSchedule, bind) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 2U); + PADDLE_ENFORCE_EQ( + loops.size(), + 2U, + phi::errors::InvalidArgument("The size of loops should be 2.")); ir_sch.Bind(loops[0], "blockIdx.x"); std::string origin = utils::GetStreamCnt(func[0]); EXPECT_EQ(origin, utils::Trim(R"ROC( @@ -753,7 +762,10 @@ TEST(IrSchedule, simple_compute_at) { auto func = cinn::lang::LowerVec( "test_simple_compute_at", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -826,7 +838,10 @@ TEST(IrSchedule, compute_at0) { auto func = cinn::lang::LowerVec( "test_compute_at0", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -900,7 +915,10 @@ TEST(IrSchedule, compute_at1) { auto func = cinn::lang::LowerVec( "test_compute_at1", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -972,7 +990,10 @@ TEST(IrSchedule, compute_at2) { auto func = 
cinn::lang::LowerVec( "test_compute_at2", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1044,7 +1065,10 @@ TEST(IrSchedule, compute_at3) { auto func = cinn::lang::LowerVec( "test_compute_at3", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1125,7 +1149,10 @@ TEST(IrSchedule, compute_at4) { auto func = cinn::lang::LowerVec( "test_compute_at4", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1187,7 +1214,10 @@ TEST(IrSchedule, compute_at5) { auto func = cinn::lang::LowerVec( "test_compute_at5", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1250,7 +1280,10 @@ TEST(IrSchedule, compute_at6) { auto func = cinn::lang::LowerVec( "test_compute_at6", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1316,7 +1349,10 @@ TEST(IrSchedule, cache_read1) { auto func = cinn::lang::LowerVec( "test_cache_read1", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1399,7 +1435,10 @@ TEST(IrSchedule, cache_read2) { auto func = cinn::lang::LowerVec( "test_cache_read2", stages, {A, B}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1469,7 +1508,10 @@ TEST(IrSchedule, cache_write1) { auto func = cinn::lang::LowerVec( "test_cache_write1", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1553,7 +1595,10 @@ TEST(IrSchedule, cache_write2) { auto func = cinn::lang::LowerVec( "test_cache_write2", stages, {A, B}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1624,7 +1669,10 @@ TEST(IrSchedule, cache_read3) { auto func = cinn::lang::LowerVec( "test_cache_read3", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1705,7 +1753,10 @@ TEST(IrSchedule, cache_write3) { 
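Every ir_schedule_test.cc hunk in this file applies the same mechanical rewrite, so one instance can stand for all of them: a bare glog CHECK_* becomes a PADDLE_ENFORCE_* call that carries an explicit phi::errors category and message. A minimal sketch of the before/after shape, reusing identifiers from the hunks above:

// Before: glog macro, aborts the process with a terse message.
// CHECK_EQ(func.size(), 1U);

// After: structured Paddle exception; the comparison macros map 1:1
// (CHECK_GT -> PADDLE_ENFORCE_GT, CHECK_NOTNULL -> PADDLE_ENFORCE_NOT_NULL).
PADDLE_ENFORCE_EQ(
    func.size(),
    1U,
    phi::errors::InvalidArgument("The size of func should be 1."));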
auto func = cinn::lang::LowerVec( "test_cache_write3", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1788,7 +1839,10 @@ TEST(IrSchedule, sync_threads) { auto func = cinn::lang::LowerVec( "test_sync_threads", stages, {A, C}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1870,7 +1924,10 @@ TEST(IrSchedule, cache_write4) { auto func = cinn::lang::LowerVec( "test_cache_write4", stages, {A, B}, {}, {}, nullptr, target, true); - CHECK_EQ(func.size(), 1U); + PADDLE_ENFORCE_EQ( + func.size(), + 1U, + phi::errors::InvalidArgument("The size of func should be 1.")); auto ast_expr = func[0]->body; std::vector vec_ast{ast_expr}; @@ -1953,7 +2010,10 @@ TEST(IrSchedule, rfactor) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2080,7 +2140,10 @@ TEST(IrSchedule, rfactor1) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.Rfactor(loops[1], 1); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2206,7 +2269,10 @@ TEST(IrSchedule, rfactor2) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("C"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.Rfactor(loops[2], 0); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2347,7 +2413,10 @@ TEST(IrSchedule, factorize_reduction) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 0); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2436,7 +2505,10 @@ TEST(IrSchedule, factorize_reduction1) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 3U); + PADDLE_ENFORCE_EQ( + loops.size(), + 3U, + phi::errors::InvalidArgument("The size of loops should be 3.")); auto new_rf_tensor = ir_sch.FactorizeReduction(loops[1], 1); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -2520,9 +2592,15 @@ TEST(IrSchedule, factorize_reduction2) { ir::ModuleExpr mod_expr(vec_ast); ir::IRSchedule ir_sch(mod_expr); auto loops = ir_sch.GetLoops("B"); - CHECK_EQ(loops.size(), 2U); + PADDLE_ENFORCE_EQ( + loops.size(), + 2U, + phi::errors::InvalidArgument("The size of loops should be 2.")); auto splited_loops = ir_sch.Split(loops[1], {4, 5}); - CHECK_EQ(splited_loops.size(), 2U); + 
PADDLE_ENFORCE_EQ( + splited_loops.size(), + 2U, + phi::errors::InvalidArgument("The size of splited_loops should be 2.")); auto new_rf_tensor = ir_sch.FactorizeReduction(splited_loops[0], 1); auto* new_rf_tensor_ref = new_rf_tensor.As(); CHECK(new_rf_tensor_ref); @@ -3278,13 +3356,19 @@ TEST(IrSchedule, ComplexIndices) { VLOG(3) << "Lowered Expr:" << ir_sch.GetModule().GetExprs().front(); auto loops_b = ir_sch.GetLoops("B"); - CHECK_EQ(loops_b.size(), 2); + PADDLE_ENFORCE_EQ( + loops_b.size(), + 2, + phi::errors::InvalidArgument("The loops size of B should be 2.")); ir_sch.Split("B", 0, {8, -1}); ir_sch.Split( "B", 2, {32, -1}); // after the first split, the loop count has grown to 3 VLOG(3) << "Splited Expr:" << ir_sch.GetModule().GetExprs().front(); - CHECK_EQ(ir_sch.GetLoops("B").size(), 4); + PADDLE_ENFORCE_EQ(ir_sch.GetLoops("B").size(), + 4, + phi::errors::InvalidArgument( + "The loops size of B should be 4 after split.")); ir_sch.Reorder("B", {2, 0, 3, 1}); VLOG(3) << "Reordered Expr:\n" << ir_sch.GetModule().GetExprs().front(); diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc index 2f8a387045bf6..d7889ebb9fc15 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm.cc @@ -24,7 +24,6 @@ #include #include #include - #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include "paddle/common/enforce.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -205,12 +205,12 @@ llvm::Value *CodeGenLLVM::EmitBinaryOp(llvm::Value *lhs, bool is_integral, bool is_signed) { llvm::Instruction::BinaryOps ops; - CHECK_EQ(lhs->getType(), rhs->getType()) - << "the types of operands of binary operation are mismatch" - << ", lhs[" << DumpToString(*lhs) << "] " << opcode << " rhs[" - << DumpToString(*rhs) << "]" - << ", lhs_type[" << DumpToString(*lhs->getType()) << "], rhs_type[" - << DumpToString(*rhs->getType()) << "]"; + PADDLE_ENFORCE_EQ( + lhs->getType(), + rhs->getType(), + phi::errors::InvalidArgument( + "the types of operands of the binary operation are mismatched")); + switch (opcode) { case '+': ops = is_integral ? 
llvm::Instruction::BinaryOps::Add @@ -288,6 +288,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Sub *op) { } llvm::Value *CodeGenLLVM::Visit(const ir::Mul *op) { + ir::TryElevateInt32ToInt64({op->a(), op->b()}); auto *lhs = Visit(&op->a()); auto *rhs = Visit(&op->b()); return EmitBinaryOp(lhs, rhs, '*', is_integral_type(op->type())); @@ -591,8 +592,8 @@ llvm::Value *CodeGenLLVM::CreateSerialFor(const ir::For *op, int stride) { llvm::Value *old_var = GetVar(op->loop_var->name); // loop iterator - llvm::AllocaInst *loop_var = - Alloca(b_->getInt32Ty(), nullptr, op->loop_var->name); + llvm::AllocaInst *loop_var = Alloca( + b_->getIntNTy(op->min->type().bits()), nullptr, op->loop_var->name); loop_var->setAlignment(llvm::Align(4)); SetVar(op->loop_var->name, loop_var); @@ -613,7 +614,8 @@ llvm::Value *CodeGenLLVM::CreateSerialFor(const ir::For *op, int stride) { // loop_body b_->SetInsertPoint(body_bb); - llvm::Value *step = llvm::ConstantInt::get(b_->getInt32Ty(), stride); + llvm::Value *step = + llvm::ConstantInt::get(b_->getIntNTy(op->min->type().bits()), stride); Visit(&op->body); llvm::Value *indvar_inc = Add(indvar, @@ -880,7 +882,10 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Load *op) { { int alignment = op->type().bits(); alignment = 8; - CHECK_GT(alignment, 0); + PADDLE_ENFORCE_GT( + alignment, + 0, + phi::errors::InvalidArgument("alignment should be greater than 0")); load_inst->setAlignment(llvm::Align(std::min(alignment, 8))); } @@ -949,7 +954,10 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Store *op) { { int alignment = op->type().bits(); alignment = 8; - CHECK_GT(alignment, 0); + PADDLE_ENFORCE_GT( + alignment, + 0, + phi::errors::InvalidArgument("alignment should be greater than 0")); store_inst->setAlignment(llvm::Align(std::min(alignment, 8))); } // TODO(fc500110): tbaa AliasAnalysis @@ -1059,9 +1067,12 @@ llvm::Value *CodeGenLLVM::Visit(const ir::_LoweredFunc_ *op) { auto init_function_state = [this]() { alias_vars_.clear(); }; init_function_state(); - CHECK_EQ(op->alloc_output_buffer_exprs.size(), - op->dealloc_output_buffer_exprs.size()) - << "the count of allocation and deallocation expressions is not match"; + PADDLE_ENFORCE_EQ( + op->alloc_output_buffer_exprs.size(), + op->dealloc_output_buffer_exprs.size(), + phi::errors::InvalidArgument( + "the counts of allocation and deallocation expressions do not " + "match")); std::vector new_body; auto create_temp_buffers = op->PrepareCreateTempBufferExprs(); @@ -1228,7 +1239,11 @@ llvm::Value *CodeGenLLVM::EmitCall_get_address(const ir::Call *op) { llvm::Value *CodeGenLLVM::EmitCall_debug_info(const ir::Call *op) { auto callee = m_->getFunction(runtime::intrinsic::debug_log_repr); - CHECK_GE(op->read_args.size(), 1UL); + PADDLE_ENFORCE_GE(op->read_args.size(), + 1UL, + phi::errors::InvalidArgument( + "debug_log_repr should have at least " + "1 argument")); std::vector args; for (auto &arg : op->read_args) { args.push_back(Visit(&arg)); } @@ -1315,7 +1330,9 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) { slices.push_back(load_inst); } - CHECK_EQ(slices.size(), 1UL); + PADDLE_ENFORCE_EQ(slices.size(), + 1UL, + phi::errors::InvalidArgument("slices size should be 1.")); return slices[0]; } @@ -1323,7 +1340,11 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) { llvm::Value *CodeGenLLVM::CreateBufferVecPtr(Type t, llvm::Value *buffer, llvm::Value *index) { - CHECK_GT(t.lanes(), 1) << "type is not a vector type: " << t; + PADDLE_ENFORCE_GT(t.lanes(), + 1, + phi::errors::InvalidArgument("type 
lanes should be greater " + "than 1, but received %d", + t.lanes())); llvm::PointerType *btype = llvm::dyn_cast(buffer->getType()); CHECK(btype); @@ -1338,7 +1359,11 @@ llvm::Value *CodeGenLLVM::CreateBufferVecPtr(Type t, llvm::Value *CodeGenLLVM::CreateBufferPtr(Type t, llvm::Value *buffer, llvm::Value *index) { - CHECK_EQ(t.lanes(), 1); + PADDLE_ENFORCE_EQ(t.lanes(), + 1, + phi::errors::InvalidArgument("type lanes should be 1, but " + "received %d", + t.lanes())); auto *btype = llvm::dyn_cast(buffer->getType()); CHECK(btype); auto *ptype = @@ -1355,7 +1380,10 @@ llvm::Value *CodeGenLLVM::CreateVecSlice(llvm::Value *vec, int lanes) { int total_lanes = llvm::dyn_cast(vec->getType())->getNumElements(); - CHECK_LE(begin + lanes, total_lanes); + PADDLE_ENFORCE_LE(begin + lanes, + total_lanes, + phi::errors::InvalidArgument( + "begin + lanes should be less than or equal to total_lanes")); if (lanes == total_lanes && begin == 0) return vec; // full slice std::vector indices; for (int i = 0; i < lanes; ++i) { @@ -1422,7 +1450,10 @@ void CodeGenLLVM::AddTbaaMetadata(llvm::Instruction *inst, if (pstride_int && pbase_int) { int stride = pstride_int->value; base = pbase_int->value; - CHECK_GE(base, 0); + PADDLE_ENFORCE_GE( + base, + 0, + phi::errors::InvalidArgument("base should be greater than or equal to 0")); width = NextPowerOfTwo(ramp->lanes * stride); while (base % width) { @@ -1491,12 +1522,15 @@ llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BufferCreate *op) { CHECK(buffer_node); std::vector args( {ll_const_int32(buffer_node->target.runtime_arch())}); - uint64_t memory_size = (buffer_node->dtype.ElementOf().bits() + 7) / 8; - for (auto shape : buffer_node->shape) { - int shape_int = shape.as_int32(); - memory_size *= shape_int; + int64_t memory_size = (buffer_node->dtype.ElementOf().bits() + 7) / 8; + // Calculate the buffer size as an Expr so symbolic dimensions are preserved + Expr buffer_size(static_cast(1)); + buffer_size = buffer_size * ir::Expr(memory_size); + for (int i = 0; i < buffer_node->shape.size(); i++) { + buffer_size = buffer_size * buffer_node->shape[i]; } - args.push_back(ll_const_int64(memory_size)); + ir::TryElevateInt32ToInt64({buffer_size}); + args.push_back(Visit(&buffer_size)); args.push_back(ll_const_int32(32)); return Call(callee, args); @@ -1596,29 +1630,50 @@ llvm::Value *CodeGenLLVM::Visit(const ir::intrinsics::BuiltinIntrin *op) { std::string func_name = op->name; if (op->id == -1) { if (func_name == "bitwise_and") { - CHECK_GE(op->args.size(), 2U); + PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "bitwise_and should have at least 2 arguments")); return b_->CreateAnd(Visit(&op->args[0]), Visit(&op->args[1])); } else if (func_name == "bitwise_or") { - CHECK_GE(op->args.size(), 2U); + PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "bitwise_or should have at least 2 arguments")); return b_->CreateOr(Visit(&op->args[0]), Visit(&op->args[1])); } else if (func_name == "bitwise_xor") { - CHECK_GE(op->args.size(), 2U); + PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "bitwise_xor should have at least 2 arguments")); return b_->CreateXor(Visit(&op->args[0]), Visit(&op->args[1])); } else if (func_name == "bitwise_not") { - CHECK_GE(op->args.size(), 1U); + PADDLE_ENFORCE_GE(op->args.size(), + 1U, + phi::errors::InvalidArgument( + "bitwise_not should have at least 1 argument")); return b_->CreateNot(Visit(&op->args[0])); } else if (func_name == "left_shift") { - CHECK_GE(op->args.size(), 2U); + 
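Two of the codegen_llvm.cc hunks above serve one goal: 64-bit-safe lowering. Loop induction variables and strides now take their width from the loop bound's type instead of a hard-coded i32, and BufferCreate keeps the allocation size as an ir::Expr so symbolic dimensions survive to codegen. A condensed sketch of both changes, using the members (b_, Alloca, Visit) that appear in the diff:

// Induction variable width follows op->min (i32, or i64 after elevation).
llvm::Type *iv_ty = b_->getIntNTy(op->min->type().bits());
llvm::AllocaInst *loop_var = Alloca(iv_ty, nullptr, op->loop_var->name);
llvm::Value *step = llvm::ConstantInt::get(iv_ty, stride);

// Buffer size stays symbolic: a product Expr, elevated to int64, then
// emitted like any other expression instead of being folded on the host.
Expr buffer_size(static_cast<int64_t>(1));
buffer_size = buffer_size * ir::Expr(memory_size);  // bytes per element
for (auto &dim : buffer_node->shape) buffer_size = buffer_size * dim;
ir::TryElevateInt32ToInt64({buffer_size});
args.push_back(Visit(&buffer_size));  // emitted as an llvm::Value*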
PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "left_shift should have at least 2 arguments")); return b_->CreateShl(Visit(&op->args[0]), Visit(&op->args[1])); } else if (func_name == "right_shift") { - CHECK_GE(op->args.size(), 2U); + PADDLE_ENFORCE_GE(op->args.size(), + 2U, + phi::errors::InvalidArgument( + "right_shift should have at least 2 arguments")); if (op->args[0]->type().is_int()) { return b_->CreateAShr(Visit(&op->args[0]), Visit(&op->args[1])); } else { return b_->CreateLShr(Visit(&op->args[0]), Visit(&op->args[1])); } } else if (func_name == "isnan") { - CHECK_GE(op->args.size(), 1U); + PADDLE_ENFORCE_GE(op->args.size(), + 1U, + phi::errors::InvalidArgument( + "isnan should have at least 1 argument")); llvm::Value *v = Visit(&op->args[0]); return b_->CreateFCmpUNO(v, v); } diff --git a/paddle/cinn/backends/llvm/codegen_llvm_test.cc b/paddle/cinn/backends/llvm/codegen_llvm_test.cc index 930e70f22e869..074e960aba678 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm_test.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm_test.cc @@ -21,12 +21,12 @@ #include #include #include - #include #include #include #include #include +#include "paddle/common/enforce.h" #include "paddle/cinn/backends/llvm/cinn_runtime_llvm_ir.h" #include "paddle/cinn/cinn.h" @@ -96,7 +96,10 @@ auto CreateIrBuffer(cinn::common::Type t, std::string name, std::vector shape, int data_alignment = 0) { - CHECK_GE(data_alignment, 0); + PADDLE_ENFORCE_GE(data_alignment, + 0, + phi::errors::InvalidArgument( + "data_alignment should be greater than or equal to 0")); auto buffer = ir::_Buffer_::Make(std::move(name), std::move(t)); if (data_alignment) { diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc b/paddle/cinn/backends/llvm/codegen_x86.cc index cfd796162241c..5987e3af7a7c3 100644 --- a/paddle/cinn/backends/llvm/codegen_x86.cc +++ b/paddle/cinn/backends/llvm/codegen_x86.cc @@ -30,7 +30,7 @@ #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/runtime/intrinsic.h" - +#include "paddle/common/enforce.h" namespace cinn::backends { CodeGenX86::CodeGenX86(llvm::Module* m, @@ -144,8 +144,10 @@ void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) { symbol_table_->PopScope(); std::swap(parallel_env_, par_env); std::swap(f_, f); - CHECK_NE(par_env.parallel_loop_count, 0) - << "find no parallel loop within parallel launch"; + PADDLE_ENFORCE_NE(par_env.parallel_loop_count, + 0, + phi::errors::InvalidArgument( + "find no parallel loop within parallel launch")); b_->SetInsertPoint(launch_end); } diff --git a/paddle/cinn/backends/llvm/execution_engine.cc b/paddle/cinn/backends/llvm/execution_engine.cc index 050fd4e0d8389..8a84d69a1d7a0 100644 --- a/paddle/cinn/backends/llvm/execution_engine.cc +++ b/paddle/cinn/backends/llvm/execution_engine.cc @@ -166,17 +166,20 @@ std::unique_ptr NaiveObjectCache::getObject( VLOG(2) << "===================== Create CINN ExecutionEngine end " "===================="; + engine->ctx = std::make_unique(); + engine->b = std::make_unique>(*engine->ctx); + llvm::SMDiagnostic error; + engine->m = llvm::parseAssemblyString( + AsStringRef(backends::kRuntimeLlvmIr), error, *engine->ctx); + return engine; } template -void ExecutionEngine::Link(const ir::Module &module) { +void ExecutionEngine::Link(const ir::Module &module, bool add_module) { utils::RecordEvent("ExecutionEngine Link", utils::EventType::kOrdinary); llvm::SMDiagnostic error; - auto ctx = std::make_unique(); - auto m = 
llvm::parseAssemblyString( - AsStringRef(backends::kRuntimeLlvmIr), error, *ctx); - auto b = std::make_unique>(*ctx); + auto ir_emitter = std::make_unique(m.get(), b.get()); VLOG(3) << "ir_emitter->Compile(module) Begin"; ir_emitter->Compile(module); @@ -200,7 +203,9 @@ void ExecutionEngine::Link(const ir::Module &module) { pass_manager, rawstream, nullptr, llvm::CGFT_ObjectFile); pass_manager.run(*m); - CHECK(AddModule(std::move(m), std::move(ctx))); + if (add_module) { + AddSelfModule(); + } if (VLOG_IS_ON(5)) { VLOG(5) << "======= dump jit execution session ======"; @@ -231,6 +236,9 @@ bool ExecutionEngine::AddModule(std::unique_ptr module, llvm::cantFail(jit_->addIRModule(std::move(tsm))); return true; } +bool ExecutionEngine::AddSelfModule() { + return AddModule(std::move(m), std::move(ctx)); +} void ExecutionEngine::ExportObject(const std::string &path) { FILE *of = fopen(path.c_str(), "w"); @@ -268,8 +276,11 @@ void ExecutionEngine::RegisterRuntimeSymbols() { } } -template void ExecutionEngine::Link(const ir::Module &module); -template void ExecutionEngine::Link(const ir::Module &module); -template void ExecutionEngine::Link(const ir::Module &module); +template void ExecutionEngine::Link(const ir::Module &module, + bool add_module); +template void ExecutionEngine::Link(const ir::Module &module, + bool add_module); +template void ExecutionEngine::Link(const ir::Module &module, + bool add_module); } // namespace cinn::backends diff --git a/paddle/cinn/backends/llvm/execution_engine.h b/paddle/cinn/backends/llvm/execution_engine.h index 63f9427a53edb..44b212f245f90 100644 --- a/paddle/cinn/backends/llvm/execution_engine.h +++ b/paddle/cinn/backends/llvm/execution_engine.h @@ -79,18 +79,22 @@ class ExecutionEngine { void *Lookup(absl::string_view name); template - void Link(const ir::Module &module); + void Link(const ir::Module &module, bool add_module = true); void ExportObject(const std::string &path); bool AddModule(std::unique_ptr module, std::unique_ptr context); + bool AddSelfModule(); + protected: explicit ExecutionEngine(bool enable_object_cache, RuntimeSymbols &&module_symbols) : cache_(std::make_unique()), - module_symbols_(std::move(module_symbols)) {} + module_symbols_(std::move(module_symbols)), + ctx(std::make_unique()), + b(std::make_unique>(*ctx)) {} void RegisterRuntimeSymbols(); @@ -106,6 +110,10 @@ class ExecutionEngine { std::unique_ptr jit_; std::unique_ptr cache_; RuntimeSymbols module_symbols_; + + std::unique_ptr ctx; + std::unique_ptr m; + std::unique_ptr> b; }; } // namespace cinn::backends diff --git a/paddle/cinn/backends/llvm/execution_engine_test.cc b/paddle/cinn/backends/llvm/execution_engine_test.cc index a13f329a81259..beb3ec61fae25 100644 --- a/paddle/cinn/backends/llvm/execution_engine_test.cc +++ b/paddle/cinn/backends/llvm/execution_engine_test.cc @@ -26,7 +26,6 @@ #include #include #include - #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include "paddle/common/enforce.h" #include "paddle/cinn/backends/llvm/cinn_runtime_llvm_ir.h" #include "paddle/cinn/backends/llvm/codegen_llvm.h" @@ -91,7 +91,11 @@ auto CreateTestBuffer() { } float *Cd = reinterpret_cast(C->memory); - CHECK_EQ(C->num_elements(), A->num_elements()); + PADDLE_ENFORCE_EQ( + C->num_elements(), + A->num_elements(), + phi::errors::InvalidArgument( + "The number of elements of C and A should be the same.")); return std::make_tuple(A, B, C); } diff --git a/paddle/cinn/backends/llvm/llvm_intrin_rule.h b/paddle/cinn/backends/llvm/llvm_intrin_rule.h index 
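The ExecutionEngine changes above move the LLVMContext, Module, and IRBuilder from Link-local state into engine members, so Link(module, /*add_module=*/false) can leave the module open for more functions before AddSelfModule() finally hands it to the JIT. A sketch of the intended call sequence, inferred from the call sites elsewhere in this patch (compilation_task.cc) rather than quoted from it:

// Phase 1: lower and link the device/host module, but keep the LLVM
// module owned by the engine instead of JIT-ing it immediately.
compiler->Build(gpu_module, /*code=*/"", /*end=*/false);

// Phase 2: append the CPU fallback kernels ("*_CX86"); the engine's
// accumulated module is then moved into the JIT once via AddSelfModule()
// (assumption: AppendCX86 finalizes the build, the diff shows only call sites).
compiler->AppendCX86(cx86_module);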
903c056196f4e..14e3718299c0f 100644 --- a/paddle/cinn/backends/llvm/llvm_intrin_rule.h +++ b/paddle/cinn/backends/llvm/llvm_intrin_rule.h @@ -26,17 +26,24 @@ #include "paddle/cinn/ir/intrinsic_ops.h" #include "paddle/cinn/ir/registry.h" #include "paddle/cinn/lang/packed_func.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace codegen { template inline void MakeFloatIntrinOp(lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg = args[0]; ir::Call *node = arg->as(); CHECK(node); - CHECK_GE(node->read_args.size(), arg_nums); + PADDLE_ENFORCE_GE( + node->read_args.size(), + arg_nums, + phi::errors::InvalidArgument( + "The number of read args should be at least arg_nums.")); if (add_float_suffix) { CHECK(node->type().is_float()); *rv = ir::intrinsics::BuiltinIntrin::Make( @@ -85,7 +92,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_isfinite", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -96,7 +106,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_isinf", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -113,7 +126,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_rsqrt", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -124,7 +140,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_exp10", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -136,7 +155,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_tan", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -147,7 +169,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_tanh", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); @@ -168,7 +193,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_cosh", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = 
arg0->as(); CHECK(node); @@ -180,7 +208,10 @@ void RegisterCpuIntrinRule() { ir::Registry::Register("lower_cpu_intrinsic_sinh", true) .SetBody([](lang::Args args, lang::RetValue *rv) { - CHECK_GE(args.size(), 1U); + PADDLE_ENFORCE_GE(args.size(), + 1U, + phi::errors::InvalidArgument( + "The number of args should be at least 1.")); Expr arg0 = args[0]; ir::Call *node = arg0->as(); CHECK(node); diff --git a/paddle/cinn/backends/llvm/llvm_optimizer.cc b/paddle/cinn/backends/llvm/llvm_optimizer.cc index e64fb9f42ee0b..22f9a37351664 100644 --- a/paddle/cinn/backends/llvm/llvm_optimizer.cc +++ b/paddle/cinn/backends/llvm/llvm_optimizer.cc @@ -74,12 +74,12 @@ class CustomPassManager : public PassManagerT { void add(llvm::Pass *pass) override { if (print_passes_) { if (is_function_pass_manager_) { - VLOG(1) << "llvm run function pass[" << std::string(pass->getPassName()) + VLOG(4) << "llvm run function pass[" << std::string(pass->getPassName()) << "]"; } if (is_module_pass_manager_) { - VLOG(1) << "llvm run module pass[" << std::string(pass->getPassName()) + VLOG(4) << "llvm run module pass[" << std::string(pass->getPassName()) << "]"; } } diff --git a/paddle/cinn/backends/llvm/runtime_symbol_registry.cc b/paddle/cinn/backends/llvm/runtime_symbol_registry.cc index 3885ebe0c4199..52dbe7f024307 100644 --- a/paddle/cinn/backends/llvm/runtime_symbol_registry.cc +++ b/paddle/cinn/backends/llvm/runtime_symbol_registry.cc @@ -20,8 +20,8 @@ #include #include "paddle/cinn/runtime/flags.h" +#include "paddle/common/enforce.h" #include "paddle/common/flags.h" - PD_DECLARE_bool(verbose_function_register); namespace cinn { @@ -51,8 +51,10 @@ void RuntimeSymbols::Register(const std::string &name, void *address) { std::lock_guard lock(mu_); auto it = symbols_.find(name); if (it != symbols_.end()) { - CHECK_EQ(it->second, address) - << "Duplicate register symbol [" << name << "]"; + PADDLE_ENFORCE_EQ( + it->second, + address, + phi::errors::InvalidArgument("Duplicate register symbol")); return; } diff --git a/paddle/cinn/backends/modular.cc b/paddle/cinn/backends/modular.cc index fb736154c7bfc..f735b8b6da56a 100644 --- a/paddle/cinn/backends/modular.cc +++ b/paddle/cinn/backends/modular.cc @@ -15,7 +15,7 @@ #include "paddle/cinn/backends/modular.h" #include "paddle/cinn/ir/ir_visitor.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { @@ -104,8 +104,14 @@ class ModularEvaluator : public ir::IRVisitorRequireReImpl { } static int gcd(int a, int b) { - CHECK_GE(a, 0); - CHECK_GE(b, 0); + PADDLE_ENFORCE_GE( + a, + 0, + phi::errors::InvalidArgument("a should be greater than or equal to 0")); + PADDLE_ENFORCE_GE( + b, + 0, + phi::errors::InvalidArgument("b should be greater than or equal to 0")); if (a < b) std::swap(a, b); if (b == 0) return a; diff --git a/paddle/cinn/backends/nvrtc/header_generator.cc b/paddle/cinn/backends/nvrtc/header_generator.cc index d4b2b9504673f..7d88ed16d0413 100644 --- a/paddle/cinn/backends/nvrtc/header_generator.cc +++ b/paddle/cinn/backends/nvrtc/header_generator.cc @@ -16,7 +16,7 @@ #include "glog/logging.h" #include "jitify.hpp" // NOLINT - +#include "paddle/common/enforce.h" namespace cinn { namespace backends { namespace nvrtc { @@ -27,8 +27,10 @@ HeaderGeneratorBase& JitSafeHeaderGenerator::GetInstance() { } const size_t JitSafeHeaderGenerator::size() const { - CHECK_EQ(include_names_.size(), headers_.size()) - << "Internal error in size of header files."; + PADDLE_ENFORCE_EQ( + include_names_.size(), + headers_.size(), + 
phi::errors::InvalidArgument("Internal error in size of header files.")); return include_names_.size(); } diff --git a/paddle/cinn/backends/nvrtc/nvrtc_util.cc b/paddle/cinn/backends/nvrtc/nvrtc_util.cc index 737d887ea809c..1b887268a1ae8 100644 --- a/paddle/cinn/backends/nvrtc/nvrtc_util.cc +++ b/paddle/cinn/backends/nvrtc/nvrtc_util.cc @@ -29,7 +29,7 @@ #include "paddle/cinn/common/common.h" #include "paddle/cinn/runtime/flags.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" PD_DECLARE_string(cinn_nvcc_cmd_path); PD_DECLARE_string(nvidia_package_dir); PD_DECLARE_bool(nvrtc_compile_to_cubin); @@ -187,7 +187,9 @@ std::string Compiler::CompileCudaSource(const std::string& code, std::string log; log.resize(log_size); NVRTC_CALL(nvrtcGetProgramLog(prog, &log[0])); - CHECK_EQ(compile_res, NVRTC_SUCCESS) << log << "\nThe code is:\n" << code; + PADDLE_ENFORCE_EQ(compile_res, + NVRTC_SUCCESS, + phi::errors::Fatal("NVRTC compilation failed")); } size_t size; diff --git a/paddle/cinn/common/cas.h b/paddle/cinn/common/cas.h index 7fbd0bfe6aa00..2d796c639406f 100755 --- a/paddle/cinn/common/cas.h +++ b/paddle/cinn/common/cas.h @@ -51,12 +51,12 @@ struct CasInterval { * 1 <= iterator_i <= 5 */ CasInterval(Expr expr_l, Expr expr_r) { - VLOG(2) << "CasInterval is : [" << expr_l << ", " << expr_r << "]."; + VLOG(6) << "CasInterval is : [" << expr_l << ", " << expr_r << "]."; expr_r = detail::ReplaceMinToConstant(expr_r); expr_l = detail::ReplaceMaxToConstant(expr_l); optim::Simplify(&expr_l); optim::Simplify(&expr_r); - VLOG(2) << "After simplify, CasInterval is : [" << expr_l << ", " << expr_r + VLOG(6) << "After simplify, CasInterval is : [" << expr_l << ", " << expr_r << "]."; if (expr_l.is_constant() && expr_r.is_constant()) { diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index 36fe9e340fcd9..5e7d3e6d876cf 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -324,12 +324,12 @@ void SplitOp::Build(pir::Builder& builder, // NOLINT const char* GenerateShapeOp::attributes_name[attributes_num] = { "output_dim_exprs", "symbol_bindings"}; -void GenerateShapeOp::Build( - pir::Builder& builder, - pir::OperationArgument& argument, - const std::vector& inputs, - const std::vector& output_dim_exprs, - const GenerateShapeOp::SymbolBindings& symbol_bindings) { +void GenerateShapeOp::Build(pir::Builder& builder, + pir::OperationArgument& argument, + const std::vector& inputs, + const std::vector& output_dim_exprs, + const SymbolBindings& symbol_bindings, + const pir::Type& output_type) { if (inputs.empty()) { VLOG(3) << "GenerateShapeOp inputs is empty"; for (const auto& attr : output_dim_exprs) { @@ -344,13 +344,7 @@ void GenerateShapeOp::Build( argument.AddAttribute( "symbol_bindings", ConvertSymbolBindingsToAttribute(builder, symbol_bindings)); - argument.AddOutputs({[&]() { - auto* ctx = pir::IrContext::Instance(); - auto type = pir::Int64Type::get(ctx); - auto dim = - ::common::make_ddim({static_cast(output_dim_exprs.size())}); - return DenseTensorType::get(ctx, type, dim); - }()}); + argument.AddOutput(output_type); ::pir::PassStopGradientsDefaultly(argument); } diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h index 1eddfaffd0df1..06f306a0e3623 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h @@ -168,7 +168,8 @@ 
class IR_API GenerateShapeOp pir::OperationArgument &argument, // NOLINT const std::vector &inputs, const std::vector &output_dim_exprs, - const SymbolBindings &symbol_bindings); + const SymbolBindings &symbol_bindings, + const pir::Type &output_type); void VerifySig() {} diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc index 63d5b519ce887..ec82d41742a70 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_pass.cc @@ -232,7 +232,7 @@ class BlockDimExprsAsserter { }; std::vector input_tensors{}; std::vector output_dim_expr_attrs{}; - GenerateShapeOp::SymbolBindings symbol_bindings{}; + SymbolBindings symbol_bindings{}; bool success = MakeGenerateShapeOpAttribute(ir_ctx_, LocalDimExprs4Value, @@ -242,14 +242,13 @@ class BlockDimExprsAsserter { &output_dim_expr_attrs, &symbol_bindings); if (!success) return std::nullopt; - auto out_shape_value = - builder_ - .Build( - input_tensors, output_dim_expr_attrs, symbol_bindings) - .out(); + auto out_type = paddle::dialect::DenseTensorType::get( + builder_.ir_context(), + pir::Int64Type::get(builder_.ir_context()), + ::common::make_ddim({dim_exprs.size()})); return builder_ .Build( - input_tensors, output_dim_expr_attrs, symbol_bindings) + input_tensors, output_dim_expr_attrs, symbol_bindings, out_type) .out(); } @@ -298,8 +297,11 @@ class BlockDimExprsAsserter { PADDLE_ENFORCE_EQ(lhs_numel, rhs_numel, ::common::errors::InvalidArgument( + "Check [%s id:%d] infer symbolic shape failed." "The numel of lhs and rhs must be equal, but " "received lhs's numel is [%d], rhs's numel is [%d]", + op->name(), + op->id(), lhs_numel, rhs_numel)); @@ -326,8 +328,8 @@ class BlockDimExprsAsserter { .out(); auto assert_op = builder_.Build( all_eq, assert_data, lhs_numel); - const std::string error_msg = "Check [" + op->name() + "_" + - std::to_string(op->id()) + + const std::string error_msg = "Check [" + op->name() + + " id:" + std::to_string(op->id()) + "] infer symbolic shape failed."; assert_op->set_attribute( paddle::dialect::AssertOp::ERROR_INFO_ATTR_NAME, diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc index 6281baeadbef2..ca422c1a593c8 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_to_pd_util.cc @@ -190,6 +190,15 @@ ::pir::Operation* ConvertConcatOp(::pir::Operation* op, return pd_op; } +::pir::Operation* ConvertGenerateShapeOp( + ::pir::Operation* op, + ::pir::IrMapping& ir_mapping, // NOLINT + ::pir::Builder& builder) { // NOLINT + auto* new_op = op->Clone(ir_mapping, {true, true, true}); + builder.Insert(new_op); + return new_op; +} + ::pir::Operation* ConvertScaleOp(::pir::Operation* op, ::pir::IrMapping& ir_mapping, // NOLINT ::pir::PatternRewriter& rewriter) { // NOLINT @@ -404,6 +413,9 @@ REGISTER_TRANSFORM_RULES(concat_op, cinn::dialect::ConcatOp::name(), cinn::dialect::details::ConvertConcatOp); +REGISTER_TRANSFORM_RULES(generate_shape_op, + cinn::dialect::GenerateShapeOp::name(), + cinn::dialect::details::ConvertGenerateShapeOp); REGISTER_TRANSFORM_RULES(scale_op, cinn::dialect::ScaleOp::name(), cinn::dialect::details::ConvertScaleOp); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc 
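GenerateShapeOp::Build no longer synthesizes its own rank-1 int64 DenseTensorType; every caller now passes the output type explicitly, which lets call sites reuse an existing value's type when they have one. The common construction, as it appears in the updated passes (template arguments written out for readability):

auto out_type = paddle::dialect::DenseTensorType::get(
    builder.ir_context(),
    pir::Int64Type::get(builder.ir_context()),
    ::common::make_ddim(
        {static_cast<int64_t>(output_dim_expr_attrs.size())}));
auto shape_value =
    builder
        .Build<cinn::dialect::GenerateShapeOp>(
            input_tensors, output_dim_expr_attrs, symbol_bindings, out_type)
        .out();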
b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc index 17317924fb07e..0ffd284ac79f7 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc @@ -38,9 +38,10 @@ bool ReplaceOpWithReshapeOp(pir::Operation* op, std::vector output_dim_expr_attrs{}; GenerateShapeOp::SymbolBindings symbol_bindings{}; - unsigned output_dim_idx = 0, input_dim_idx = 0; int64_t local_dim_expr_id = 0; - for (; output_dim_idx < output_shape.size(); ++output_dim_idx) { + for (unsigned output_dim_idx = 0, input_dim_idx = 0; + output_dim_idx < output_shape.size(); + ++output_dim_idx) { const auto& dim_expr = output_shape.at(output_dim_idx); if (dim_expr.isa()) { output_dim_expr_attrs.emplace_back( @@ -64,8 +65,16 @@ bool ReplaceOpWithReshapeOp(pir::Operation* op, } } } + auto out_type = paddle::dialect::DenseTensorType::get( + rewriter.ir_context(), + pir::Int64Type::get(rewriter.ir_context()), + ::common::make_ddim( + {static_cast(output_dim_expr_attrs.size())})); auto cinn_generate_shape = rewriter.Build( - std::vector{input}, output_dim_expr_attrs, symbol_bindings); + std::vector{input}, + output_dim_expr_attrs, + symbol_bindings, + out_type); auto pd_reshape = rewriter.Build( op->operand_source(0), cinn_generate_shape.result(0)); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc index 0578c79b35a2b..473763bb4dcec 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc @@ -313,9 +313,18 @@ std::optional GetOutOfRewrittenGenerateShapeOp( &output_dim_expr_attrs, &symbol_bindings); if (!success) return std::nullopt; + auto out_type = [&]() -> pir::Type { + if (shape.type().isa()) { + return shape.type(); + } + return paddle::dialect::DenseTensorType::get( + rewriter->ir_context(), + pir::Int64Type::get(rewriter->ir_context()), + ::common::make_ddim({output_dim_expr_attrs.size()})); + }(); return rewriter ->Build( - input_tensors, output_dim_expr_attrs, symbol_bindings) + input_tensors, output_dim_expr_attrs, symbol_bindings, out_type) .out(); } @@ -323,9 +332,8 @@ bool ReplaceShapeOpsToGenerateShape( pir::OpOperand shape_operand, pir::PatternRewriter* rewriter, pir::ShapeConstraintIRAnalysis* shape_analysis) { - if (shape_operand.source() - .defining_op() - ->isa()) { + auto* shape_def_op = shape_operand.source().defining_op(); + if (!shape_def_op || shape_def_op->isa()) { return false; } auto ShapeOrDataDimExprs4Value = @@ -379,6 +387,82 @@ class FuseShapeOpsIntoGenerateShapeOpPattern } }; +class FuseSingleElementShapeOpsIntoGenerateShapeOpPattern + : public pir::RewritePattern { + public: + explicit FuseSingleElementShapeOpsIntoGenerateShapeOpPattern( + pir::IrContext* context) + : pir::RewritePattern(MatchAnyOpTypeTag(), + 1 /*benefit*/, + context, + {} /*generated_names*/) {} + + bool Match(pir::Operation* op) const override { + auto& shape_analysis = + pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram()); + if (!IsSingleElementShapeOp(op, &shape_analysis)) return false; + if (op->isa()) return false; + + // all user op's output should has no data of shape expr + pir::Value output = op->result(0); + if (output.use_empty()) return false; + for (auto iter = output.use_begin(); iter != 
output.use_end(); ++iter) { + auto* user = iter->owner(); + if (IsSingleElementShapeOp(user, &shape_analysis)) return false; + if (user->isa()) return false; + } + + return true; + } + + void Rewrite(pir::Operation* op, + pir::PatternRewriter& rewriter) const override { + auto& shape_analysis = + pir::ShapeAnalysisManager::Instance().Get(op->GetParentProgram()); + + auto ShapeOrDataDimExprs4Value = + [&shape_analysis]( + pir::Value value) -> const symbol::ShapeOrDataDimExprs& { + return shape_analysis.GetShapeOrDataForValue(value); + }; + std::optional opt_generated_shape = + GetOutOfRewrittenGenerateShapeOp( + op->result(0), &rewriter, ShapeOrDataDimExprs4Value); + if (!opt_generated_shape.has_value()) { + LOG(WARNING) << "Create GenerateShapeOp Failed."; + return; + } + + rewriter.ReplaceAllUsesWith(op->result(0), opt_generated_shape.value()); + + if (op->use_empty()) { + rewriter.EraseOp(op); + } + } + + private: + bool IsSingleElementShapeOp( + pir::Operation* op, + pir::ShapeConstraintIRAnalysis* shape_analysis) const { + if (op->num_operands() == 0) return false; + if (op->num_results() != 1) return false; + + pir::Value output = op->result(0); + const auto& out_shape = shape_analysis->GetShapeOrDataForValue(output); + if (!out_shape.isa()) return false; + if (!out_shape.data().has_value()) return false; + + auto dtype = + output.type().dyn_cast().dtype(); + if (!dtype.isa() && !dtype.isa()) { + return false; + } + + // Only process the op which output is a single element + return out_shape.data()->size() == 1; + } +}; + class FuseShapeOpsIntoGenerateShapeOpPass : public pir::PatternRewritePass { public: FuseShapeOpsIntoGenerateShapeOpPass() @@ -393,6 +477,7 @@ class FuseShapeOpsIntoGenerateShapeOpPass : public pir::PatternRewritePass { context); ps.Add>( context); + ps.Add(context); return ps; } diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc index 30b470d42ca2a..f2afbae3d515d 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/generate_shape_util.cc @@ -83,8 +83,10 @@ std::optional InsertGenerateShapeOpToRunFirst( &symbol_bindings); if (success) { return builder - ->Build( - minimal_inputs, output_dim_expr_attrs, symbol_bindings) + ->Build(minimal_inputs, + output_dim_expr_attrs, + symbol_bindings, + value.type()) .out(); } return std::nullopt; diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc index 8f0bab178d75c..c3daa04fc2f4e 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/broadcast_with_cf.cc @@ -233,17 +233,24 @@ std::tuple BroadcastableToCondValue( &rhs_symbol_bindings); CHECK(success); + auto out_type = paddle::dialect::DenseTensorType::get( + builder.ir_context(), + pir::Int64Type::get(builder.ir_context()), + ::common::make_ddim({1})); + auto lhs_value = builder .Build(lhs_minimal_inputs, lhs_output_dim_expr_attrs, - lhs_symbol_bindings) + lhs_symbol_bindings, + out_type) .out(); auto rhs_value = builder .Build(rhs_minimal_inputs, rhs_output_dim_expr_attrs, - rhs_symbol_bindings) + rhs_symbol_bindings, + out_type) .out(); auto const_one = builder diff --git 
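The new FuseSingleElementShapeOpsIntoGenerateShapeOpPattern targets scalar-like shape computations (a single int32/int64 element whose dim-expr data is known) that the existing per-op patterns miss, and it is registered alongside them in the pass. With the template argument spelled out, the registration in InitializePatterns reads:

// Registered in FuseShapeOpsIntoGenerateShapeOpPass::InitializePatterns;
// template argument restored here for readability.
ps.Add<FuseSingleElementShapeOpsIntoGenerateShapeOpPattern>(context);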
a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc index a36c208f0c96c..c2604697d68af 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc @@ -110,23 +110,26 @@ OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr) { : group_op_kind; } } - - auto group = std::make_shared(ops); - - if (fusion_op.attributes().count("group_info")) { - auto attr = fusion_op.attribute("group_info") - .dyn_cast() - .data(); - - group_op_kind = - static_cast(attr.op_pattern_kind) > static_cast(group_op_kind) - ? attr.op_pattern_kind - : group_op_kind; - group->set_loop_ranges(attr.loop_ranges); - group->set_loop_ranges_expr(attr.loop_ranges_expr); - group->set_reduce_axis(attr.reduce_axis); - group->set_alignment_schedule_info(attr.alignment_schedule_info); - } + PADDLE_ENFORCE_GT(fusion_op.attributes().count("group_info"), + 0UL, + phi::errors::InvalidArgument( + "fusion_op should have a group_info attribute.")); + + const auto attr = fusion_op.attribute("group_info") + .dyn_cast() + .data(); + + const auto& fn_name = attr.fn_name; + auto group = std::make_shared(ops, fn_name); + + group_op_kind = + static_cast(attr.op_pattern_kind) > static_cast(group_op_kind) + ? attr.op_pattern_kind + : group_op_kind; + group->set_loop_ranges(attr.loop_ranges); + group->set_loop_ranges_expr(attr.loop_ranges_expr); + group->set_reduce_axis(attr.reduce_axis); + group->set_alignment_schedule_info(attr.alignment_schedule_info); group->set_op_pattern_kind(group_op_kind); // Rebuild output_ops and input_ops of the group diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 648b3af363241..89ca95884fb52 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -33,116 +33,128 @@ namespace dialect { namespace ir { using CompatibleInfo = cinn::hlir::framework::pir::CompatibleInfo; -class SumOpPattern : public paddle::drr::DrrPatternBase { - public: - std::string name() const override { return "SumOpPattern"; } - - void operator()(paddle::drr::DrrPatternContext *ctx) const override { - // Source Pattern - paddle::drr::SourcePattern pattern = ctx->SourcePattern(); - const auto &full_int_array = - pattern.Op(paddle::dialect::FullIntArrayOp::name(), - {{"value", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype_2")}, - {"place", pattern.Attr("place_2")}}); - - const auto &sum = pattern.Op(paddle::dialect::SumOp::name(), - {{"dtype", pattern.Attr("dtype")}, - {"keepdim", pattern.Attr("keep_dim")}}); - pattern.Tensor("ret") = sum(pattern.Tensor("arg0"), full_int_array()); - - // Result patterns - paddle::drr::ResultPattern res = pattern.ResultPattern(); - const auto &cinn_reduce_sum = - res.Op(cinn::dialect::ReduceSumOp::name(), - {{"dim", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype")}, - {"keep_dim", pattern.Attr("keep_dim")}}); - res.Tensor("ret") = cinn_reduce_sum(res.Tensor("arg0")); +namespace { + +template +std::vector GetVectorFromIntArrayAttribute( + const pir::ArrayAttribute &array_attr) { + const auto &vector_attr = array_attr.AsVector(); + + std::vector result; + if (vector_attr.size() > 0) { + PADDLE_ENFORCE_EQ(vector_attr[0].isa<::pir::Int64Attribute>(), + true, + phi::errors::Unimplemented( + "the 0th element 
MUST be ir::Int64Attribute")); + for (size_t i = 0; i < vector_attr.size(); ++i) { + result.push_back(vector_attr[i].dyn_cast<::pir::Int64Attribute>().data()); + } } -}; + return result; +} -class MaxOpPattern : public paddle::drr::DrrPatternBase { - public: - std::string name() const override { return "MaxOpPattern"; } +} // namespace - void operator()(paddle::drr::DrrPatternContext *ctx) const override { - // Source Pattern - paddle::drr::SourcePattern pattern = ctx->SourcePattern(); - const auto &full_int_array = - pattern.Op(paddle::dialect::FullIntArrayOp::name(), - {{"value", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype_2")}, - {"place", pattern.Attr("place_2")}}); +class SumOpPattern : public pir::OpRewritePattern { + public: + using pir::OpRewritePattern::OpRewritePattern; - const auto &pd_max = pattern.Op(paddle::dialect::MaxOp::name(), - {{"keepdim", pattern.Attr("keep_dim")}}); - pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array()); + bool Match(paddle::dialect::SumOp op) const override { + if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false; + auto *axes_op = op->operand_source(1).defining_op(); + return axes_op && axes_op->isa(); + } - // Result patterns - paddle::drr::ResultPattern res = pattern.ResultPattern(); - const auto &cinn_reduce_max = - res.Op(cinn::dialect::ReduceMaxOp::name(), - {{"dim", pattern.Attr("axis_info")}, - {"keep_dim", pattern.Attr("keep_dim")}}); - res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0")); + void Rewrite(paddle::dialect::SumOp op, + pir::PatternRewriter &rewriter) const override { + auto *axes_op = op->operand_source(1).defining_op(); + auto full_int_array_op = + axes_op->dyn_cast(); + + // get attribute value from full_int_array op + const std::vector axis = GetVectorFromIntArrayAttribute( + full_int_array_op.attribute("value").dyn_cast()); + const bool keep_dim = + op.attribute("keepdim").dyn_cast<::pir::BoolAttribute>().data(); + const auto &dtype = op.attribute("dtype") + .dyn_cast() + .data(); + + auto cinn_reduce = rewriter.Build( + op->operand_source(0), axis, keep_dim, dtype); + rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0)); + rewriter.EraseOp(op); + if (full_int_array_op->use_empty()) { + rewriter.EraseOp(full_int_array_op); + } } }; -class MinOpPattern : public paddle::drr::DrrPatternBase { +template +class ReduceMinMaxOpPattern : public pir::OpRewritePattern { public: - std::string name() const override { return "MinOpPattern"; } + using pir::OpRewritePattern::OpRewritePattern; - void operator()(paddle::drr::DrrPatternContext *ctx) const override { - // Source Pattern - paddle::drr::SourcePattern pattern = ctx->SourcePattern(); - const auto &full_int_array = - pattern.Op(paddle::dialect::FullIntArrayOp::name(), - {{"value", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype_2")}, - {"place", pattern.Attr("place_2")}}); - - const auto &pd_max = pattern.Op(paddle::dialect::MinOp::name(), - {{"keepdim", pattern.Attr("keep_dim")}}); - pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array()); + bool Match(SOURCE_OP op) const override { + if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false; + auto *axes_op = op->operand_source(1).defining_op(); + return axes_op && axes_op->template isa(); + } - // Result patterns - paddle::drr::ResultPattern res = pattern.ResultPattern(); - const auto &cinn_reduce_max = - res.Op(cinn::dialect::ReduceMinOp::name(), - {{"dim", pattern.Attr("axis_info")}, - {"keep_dim", 
pattern.Attr("keep_dim")}}); - res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0")); + void Rewrite(SOURCE_OP op, pir::PatternRewriter &rewriter) const override { + auto *axes_op = op->operand_source(1).defining_op(); + auto full_int_array_op = + axes_op->template dyn_cast(); + + // get attribute value from full_int_array op + const std::vector axis = GetVectorFromIntArrayAttribute( + full_int_array_op.attribute("value") + .template dyn_cast()); + const bool keep_dim = op.attribute("keepdim") + .template dyn_cast<::pir::BoolAttribute>() + .data(); + + auto cinn_reduce = + rewriter.Build(op->operand_source(0), axis, keep_dim); + rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0)); + rewriter.EraseOp(op); + if (full_int_array_op->use_empty()) { + rewriter.EraseOp(full_int_array_op); + } } }; -class ProdOpPattern : public paddle::drr::DrrPatternBase { +class ProdOpPattern : public pir::OpRewritePattern { public: - std::string name() const override { return "ProdOpPattern"; } + using pir::OpRewritePattern::OpRewritePattern; - void operator()(paddle::drr::DrrPatternContext *ctx) const override { - // Source Pattern - paddle::drr::SourcePattern pattern = ctx->SourcePattern(); - const auto &full_int_array = - pattern.Op(paddle::dialect::FullIntArrayOp::name(), - {{"value", pattern.Attr("axis_info")}, - {"dtype", pattern.Attr("dtype_2")}, - {"place", pattern.Attr("place_2")}}); - - const auto &pd_max = - pattern.Op(paddle::dialect::ProdOp::name(), - {{"keep_dim", pattern.Attr("keep_dim")}, - {"reduce_all", pattern.Attr("reduce_all")}}); - pattern.Tensor("ret") = pd_max(pattern.Tensor("arg0"), full_int_array()); + bool Match(paddle::dialect::ProdOp op) const override { + if (CompatibleInfo::IsDeniedForCinn(*op.operation())) return false; + auto *axes_op = op->operand_source(1).defining_op(); + return axes_op && axes_op->isa(); + } - // Result patterns - paddle::drr::ResultPattern res = pattern.ResultPattern(); - const auto &cinn_reduce_max = - res.Op(cinn::dialect::ReduceProdOp::name(), - {{"dim", pattern.Attr("axis_info")}, - {"keep_dim", pattern.Attr("keep_dim")}, - {"reduce_all", pattern.Attr("reduce_all")}}); - res.Tensor("ret") = cinn_reduce_max(res.Tensor("arg0")); + void Rewrite(paddle::dialect::ProdOp op, + pir::PatternRewriter &rewriter) const override { + auto *axes_op = op->operand_source(1).defining_op(); + auto full_int_array_op = + axes_op->dyn_cast(); + + // get attribute value from full_int_array op + const std::vector axis = GetVectorFromIntArrayAttribute( + full_int_array_op.attribute("value").dyn_cast()); + const bool keep_dim = + op.attribute("keep_dim").dyn_cast<::pir::BoolAttribute>().data(); + const bool reduce_all = + op.attribute("reduce_all").dyn_cast<::pir::BoolAttribute>().data(); + + auto cinn_reduce = rewriter.Build( + op->operand_source(0), axis, keep_dim, reduce_all); + rewriter.ReplaceAllUsesWith(op.result(0), cinn_reduce.result(0)); + rewriter.EraseOp(op); + if (full_int_array_op->use_empty()) { + rewriter.EraseOp(full_int_array_op); + } } }; @@ -1117,10 +1129,12 @@ pir::RewritePatternSet PdOpToCinnOpPass::InitializePatterns( pir::RewritePatternSet ps(context); ps.Add( context); // NOTE, scale op pattern should before AddBroadcastTo - ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); - ps.Add(paddle::drr::Create(context)); + ps.Add(context); + ps.Add>(context); + ps.Add>(context); + ps.Add(context); ps.Add(context); ps.Add(context); ps.Add(context); diff --git 
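The reduce-op rewrites above drop DRR in favor of plain pir::OpRewritePattern: Match guards on CompatibleInfo::IsDeniedForCinn and on the axes operand being produced by a FullIntArrayOp, Rewrite reads the attributes directly, and a single class template serves both min and max. With the angle-bracket arguments reconstructed from the class definitions above, the attribute-extraction helper and the pattern registrations look like:

// Read the int64 axes out of the FullIntArrayOp's "value" attribute.
const std::vector<int64_t> axis = GetVectorFromIntArrayAttribute<int64_t>(
    full_int_array_op.attribute("value").dyn_cast<pir::ArrayAttribute>());

// One instantiation per (source op, cinn reduce op) pair in InitializePatterns.
ps.Add<SumOpPattern>(context);
ps.Add<ReduceMinMaxOpPattern<paddle::dialect::MaxOp,
                             cinn::dialect::ReduceMaxOp>>(context);
ps.Add<ReduceMinMaxOpPattern<paddle::dialect::MinOp,
                             cinn::dialect::ReduceMinOp>>(context);
ps.Add<ProdOpPattern>(context);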
a/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc b/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc index 74f3e4b4f200d..234421cf27600 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc @@ -866,27 +866,52 @@ struct PirToPyCodeConverterHelper { } std::string ConvertInputTypes(const pir::Operation* op) { - std::stringstream ss; - ss << "["; - for (int i = 0; i < op->num_operands(); ++i) { - if (i > 0) { - ss << ", "; + const auto& VisitValue = [&](const auto& DoEachValue) { + for (int i = 0; i < op->num_operands(); ++i) { + DoEachValue(op->operand_source(i)); } - ss << ConvertType(op->operand_source(i).type()); - } - ss << "]"; - return ss.str(); + }; + return ConvertValueTypes(VisitValue); + } + + std::string ConvertBlockArgTypes(const pir::Block& block) { + const auto& VisitValue = [&](const auto& DoEachValue) { + for (const auto& arg : block.args()) { + DoEachValue(arg); + } + }; + return ConvertValueTypes(VisitValue); + } + + std::string ConvertBlockKwArgTypes(const pir::Block& block) { + const auto& VisitValue = [&](const auto& DoEachValue) { + for (const auto& [_, arg] : block.kwargs()) { + DoEachValue(arg); + } + }; + return ConvertValueTypes(VisitValue); } std::string ConvertOutputTypes(const pir::Operation* op) { + const auto& VisitValue = [&](const auto& DoEachValue) { + for (int i = 0; i < op->num_results(); ++i) { + DoEachValue(op->result(i)); + } + }; + return ConvertValueTypes(VisitValue); + } + + template + std::string ConvertValueTypes(const VisitValueT& VisitValue) { std::stringstream ss; ss << "["; - for (int i = 0; i < op->num_results(); ++i) { - if (i > 0) { + int i = 0; + VisitValue([&](pir::Value value) { + if (i++ > 0) { ss << ", "; } - ss << ConvertType(op->result(i).type()); - } + ss << ConvertType(value.type()); + }); ss << "]"; return ss.str(); } @@ -1098,7 +1123,45 @@ struct PirToPyCodeConverterHelper { } ss << "]"; } - ss << "]"; + ss << "], "; + } + { + int i = 0; + ss << "block_positional_arg_types=["; + for (const auto& region : *op) { + if (i++ > 0) { + ss << ","; + } + int j = 0; + ss << "["; + for (const auto& block : region) { + if (j++ > 0) { + ss << ","; + } + ss << ConvertBlockArgTypes(block); + } + ss << "]"; + } + ss << "], "; + } + { + int i = 0; + ss << "block_keyword_arg_types=["; + for (const auto& region : *op) { + if (i++ > 0) { + ss << ","; + } + int j = 0; + ss << "["; + for (const auto& block : region) { + if (j++ > 0) { + ss << ","; + } + ss << ConvertBlockKwArgTypes(block); + } + ss << "]"; + } + ss << "], "; } return ss.str(); } @@ -1138,18 +1201,10 @@ struct PirToPyCodeConverterHelper { std::string GetPyClassName() { std::ostringstream ss; - ss << "PirProgram_" << RandomInt(); + ss << "PirProgram_" << program_->id(); return ss.str(); } - int64_t RandomInt() { - std::random_device rd{}; - std::mt19937_64 gen(rd()); - std::uniform_int_distribution dis( - 0, std::numeric_limits::max()); - return dis(gen); - } - std::string ConvertIStringsToString(const IStrings& istrings) { std::stringstream ss; for (const auto& istring : istrings) { diff --git a/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc index 4dd7e3ecf3e7d..98a8ff2e7ec3e 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc +++ 
b/paddle/cinn/hlir/dialect/operator/transforms/split_generate_shape_into_shape_ops_pass.cc @@ -136,10 +136,10 @@ struct CachedDimExprToValueConverter { ->Build(value, 0, dims.size() - 1) .out(); }; - if (tensor_dim.value.type() - .dyn_cast() - .dims() - .size() == 0) { + const auto& ddim = tensor_dim.value.type() + .dyn_cast() + .dims(); + if (ddim.size() == 0 || (ddim.size() == 1 && ddim[0] == 1)) { return CastToInt64IfNeed(tensor_dim.value); } return CastToInt64IfNeed(rewriter diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h index 4d5284f22f6ed..3711f102dc2e8 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h @@ -31,6 +31,8 @@ struct BucketLoweredFuncsWrapper { std::vector> predicate2funcs; ir::LoweredFunc infer_shape_func; + std::vector> + predicate2funcsCX86; }; template diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.cc b/paddle/cinn/hlir/framework/pir/compilation_cache.cc index 1c5322c38866e..86f65bfb5c8db 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_cache.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_cache.cc @@ -37,11 +37,22 @@ void* BackendResource::GetInferFuncPtr() const { return ptr; } +void* BackendResource::GetCX86HostFuncPtr() const { + VLOG(4) << "Lookup kernel name: " << host_fn_name_ + "_CX86"; + void* ptr = backend_compiler_->Lookup(host_fn_name_ + "_CX86"); + PADDLE_ENFORCE_NOT_NULL( + ptr, + ::common::errors::InvalidArgument("Can't find kernel function %s", + host_fn_name_ + "_CX86")); + return ptr; +} + pir::CINNKernelInfo BackendResource::GenerateKernelInfo() const { pir::CINNKernelInfo kernel_info; kernel_info.fn_name = host_fn_name_; kernel_info.fn_ptr = GetHostFuncPtr(); kernel_info.infer_shape_fn_ptr = GetInferFuncPtr(); + kernel_info.CX86_fn_ptr = GetCX86HostFuncPtr(); kernel_info.int_args_map = GetIntArgsMap(); return kernel_info; } diff --git a/paddle/cinn/hlir/framework/pir/compilation_cache.h b/paddle/cinn/hlir/framework/pir/compilation_cache.h index 0294755d399ef..f0f6c53380395 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_cache.h +++ b/paddle/cinn/hlir/framework/pir/compilation_cache.h @@ -41,6 +41,7 @@ class BackendResource final { void* GetHostFuncPtr() const; void* GetInferFuncPtr() const; + void* GetCX86HostFuncPtr() const; const std::map& GetIntArgsMap() const { return int_args_map_; } diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc b/paddle/cinn/hlir/framework/pir/compilation_task.cc index 1304979d14a61..39ddcf8291306 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc @@ -29,6 +29,11 @@ void GroupCompilationContext::SetLoweredFuncs( predicates_.push_back(std::move(predicate2func.first)); lowered_funcs_.push_back(std::move(predicate2func.second)); } + for (std::pair& predicate2func : + funcs.predicate2funcsCX86) { + CX86_predicates_.push_back(std::move(predicate2func.first)); + CX86_lowered_funcs_.push_back(std::move(predicate2func.second)); + } infer_shape_lowered_func_ = std::move(funcs.infer_shape_func); } @@ -73,11 +78,24 @@ std::shared_ptr CompilationTask::CodegenAndJit() { } builder.SetInferShapeFunc(context_->infer_shape_lowered_func_); ir::Module ir_module = builder.Build(); - return BuildPirCINNKernelInfo(ir_module); + + ir::Module::Builder builder_CX86(cinn::common::UniqName("module"), + common::DefaultHostTarget()); + CHECK_EQ(context_->CX86_predicates_.size(), + 
context_->CX86_lowered_funcs_.size()); + for (const ir::Expr& predicate : context_->CX86_predicates_) { + builder_CX86.AddPredicate(predicate); + } + for (const ir::LoweredFunc& func : context_->CX86_lowered_funcs_) { + builder_CX86.AddFunction(func); + } + ir::Module ir_moduleCX86 = builder_CX86.Build(); + + return BuildPirCINNKernelInfo(ir_module, ir_moduleCX86); } std::shared_ptr CompilationTask::BuildPirCINNKernelInfo( - const ir::Module& module) { + const ir::Module& module, const ir::Module& CX86module) { auto compilation_result = std::make_shared(context_->target_); auto backend_resource = std::make_shared( @@ -86,7 +104,8 @@ std::shared_ptr CompilationTask::BuildPirCINNKernelInfo( context_->group_->FuncName() + "_infer_shape", context_->group_->int_args_map()); VLOG(5) << "Start to compile module into cuda kernel..."; - backend_resource->GetBackendCompiler()->Build(module, ""); + backend_resource->GetBackendCompiler()->Build(module, "", false); + backend_resource->GetBackendCompiler()->AppendCX86(CX86module); compilation_result->SetBackendResource(backend_resource); VLOG(5) << "End to compile module into cuda kernel."; return compilation_result; diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.h b/paddle/cinn/hlir/framework/pir/compilation_task.h index d104d264b6852..1ed3e2d5e6217 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.h +++ b/paddle/cinn/hlir/framework/pir/compilation_task.h @@ -42,6 +42,8 @@ class GroupCompilationContext { const pir::OpLoweringGroupPtr& group_; std::vector predicates_; std::vector lowered_funcs_; + std::vector CX86_predicates_; + std::vector CX86_lowered_funcs_; ir::LoweredFunc infer_shape_lowered_func_; }; @@ -56,7 +58,7 @@ class CompilationTask { void Lowering(); std::shared_ptr CodegenAndJit(); std::shared_ptr BuildPirCINNKernelInfo( - const ir::Module& module); + const ir::Module& module, const ir::Module& CX86module); GroupCompilationContext* context_; }; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc index e5187f47ab471..e23ec953431c0 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_group.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.cc @@ -145,8 +145,9 @@ std::shared_ptr OpLoweringGroup::Clone( ops_mapper[op] = new_op; } + const auto new_fn_name = this->fn_name_ + "_cloned"; // Construct Base information for new Group - auto new_group = std::make_shared(new_ops); + auto new_group = std::make_shared(new_ops, new_fn_name); for (auto* op : this->output_ops_) { new_group->output_ops_.insert(ops_mapper.at(op)); } diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_group.h b/paddle/cinn/hlir/framework/pir/op_lowering_group.h index 935e759ed2331..7595985d4d5b9 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_group.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_group.h @@ -42,15 +42,13 @@ class OpLoweringGroup { OpLoweringGroup(const OpLoweringGroup&) = delete; OpLoweringGroup(OpLoweringGroup&&) = delete; - explicit OpLoweringGroup(const std::vector<::pir::Operation*>& group_ops) - : ops_(group_ops) { - fn_name_ = CompatibleInfo::GroupOpsName(ops_); - } + explicit OpLoweringGroup(const std::vector<::pir::Operation*>& group_ops, + const std::string& fn_name) + : ops_(group_ops), fn_name_(fn_name) {} - explicit OpLoweringGroup(std::initializer_list<::pir::Operation*> group_ops) - : ops_(group_ops) { - fn_name_ = CompatibleInfo::GroupOpsName(ops_); - } + explicit OpLoweringGroup(std::initializer_list<::pir::Operation*> group_ops, 
+ const std::string& fn_name) + : ops_(group_ops), fn_name_(fn_name) {} const std::string& FuncName() const { return this->fn_name_; } ::pir::Block* GetParentBlock() const; diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 8ba8753a84eaf..4c4362aec935d 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -20,6 +20,7 @@ #include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/backends/codegen_device_util.h" #include "paddle/cinn/common/dim_expr_converter.h" +#include "paddle/cinn/common/target.h" #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h" #include "paddle/cinn/hlir/framework/compile_error.h" @@ -124,19 +125,9 @@ std::shared_ptr OpLowererImpl::GetGroupInfo( } } - BuildBroadcastInfo(group, group_info); - for (auto& op : group->output_ops()) { group_info->direct_output_var_names.insert(ValueName(op->result(0))); // collect all output tensor. - if (op->name() == "cinn_op.yield_store") { - auto input_var_name = ValueName(op->operand_source(0)); - if (group_info->broadcast_info.count(input_var_name)) { - auto base_info = group_info->broadcast_info[input_var_name]; - base_info.with_constrain = true; - group_info->broadcast_info[ValueName(op->result(0))] = base_info; - } - } for (auto opresult : op->results()) { if (tensor_map.count(opresult) == 0) { continue; @@ -146,13 +137,7 @@ std::shared_ptr OpLowererImpl::GetGroupInfo( } for (const auto& val : group->output_values()) { - if (val.defining_op()->name() == "cinn_op.reshape" && - erase_reshape.count(val.defining_op())) { - group_info->direct_output_var_names.insert( - ValueName(val.defining_op()->operand_source(0))); - } else { - group_info->direct_output_var_names.insert(ValueName(val)); - } + group_info->direct_output_var_names.insert(ValueName(val)); } return group_info; } @@ -207,6 +192,8 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower( if (ops.size() == 1 && ops[0]->name() == "custom_call") { return {{{ir::Expr(1), LowerCustomCall(group)[0]}}, ir::LoweredFunc()}; } + auto X86Expr = LowerX86(group, ops, apply_op_schedule); + VLOG(3) << "After x86 lower, ir is: \n" << X86Expr; std::vector group_func_arg_tensors; std::unordered_map<::pir::Value, ir::Tensor> tensor_map; @@ -272,6 +259,9 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower( ir_sch.GetModule().GetExprs()[0]); } + // The last func is stored as a kernel on x86 + cond2func_bodies.emplace_back(ir::Expr(true), X86Expr); + // 3.Do post-processing, // including preparing function args and temporary variables, // applying low-level optimization passes, etc. @@ -296,10 +286,16 @@ BucketLoweredFuncsWrapper OpLowererImpl::BucketLower( "The size of funcs and cond2func_bodies should be " "the same.")); BucketLoweredFuncsWrapper funcs_wrapper; - for (int i = 0; i < funcs.size(); ++i) { + for (int i = 0; i < funcs.size() - 1; ++i) { funcs_wrapper.predicate2funcs.emplace_back(cond2func_bodies[i].first, funcs[i]); } + // The last func is x86 kernel. 
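Before the loop below tags that trailing function with the "_CX86" suffix, it may help to make the bucket layout concrete. This is a minimal standalone sketch, with hypothetical names rather than Paddle's actual runtime types, of how predicate-guarded device buckets plus a trailing host fallback could be dispatched; in the patch itself the pairs are only stored here and compiled later.

#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

using Predicate = std::function<bool(int64_t)>;  // e.g. a shape guard
using Kernel = std::function<void()>;

// Hypothetical dispatcher: first matching device bucket wins; otherwise the
// host function (the "_CX86"-suffixed body in this patch) runs.
void Dispatch(const std::vector<std::pair<Predicate, Kernel>>& device_buckets,
              const Kernel& host_fallback, int64_t numel) {
  for (const auto& [pred, kernel] : device_buckets) {
    if (pred(numel)) {
      kernel();
      return;
    }
  }
  host_fallback();
}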
+ for (size_t i = funcs.size() - 1; i < funcs.size(); ++i) { + funcs[i]->name = funcs[i]->name + "_CX86"; + funcs_wrapper.predicate2funcsCX86.emplace_back(cond2func_bodies[i].first, + funcs[i]); + } funcs_wrapper.infer_shape_func = GenerateInferShapeFunc(group, infer_shape_tensor_args, group_func_args); @@ -514,159 +510,6 @@ std::vector OpLowererImpl::LowerGroup( &infer_shape_args); } -void OpLowererImpl::BuildBroadcastInfo(const OpLoweringGroupPtr& group, - std::shared_ptr group_info) { - // TODO(phlrain): this is primary verion for loop aligment - // will be update by a new method - auto& align_info = group->mut_alignment_schedule_info(); - - auto& ops = group->ops(); - for (auto op1 : ops) { - auto it = align_info.find(op1); - if (it == align_info.end()) { - continue; - } - if (op1->name() == "cinn_op.generate_shape") { - continue; - } - - if (it->second.size() > 1) { - for (size_t i = 0; i < it->second.size(); ++i) { - } - // TODO(phlran): merge to factor info here - it->second.front().factor_info = it->second.back().factor_info; - it->second.resize(1); - } - - PADDLE_ENFORCE_EQ( - it->second.size(), - 1, - phi::errors::Unimplemented("%s, only suppopt one transform yet", - it->first->name())); - - if (it->second[0].type == ScheduleAlignType::kBroadcast) { - // get broadcast op - auto broadcast_axes = it->second[0].axis_info; - auto output_shape = it->second[0].factor_info; - - phi::DDim in_dim; - - if (it->first->name() == "cinn_op.reshape") { - // TODO(phlrain): deal with reshape in a better way - if (it->first->result(0).use_count() == 1 && - it->first->result(0).first_use().owner()->isa<::pir::YieldOp>()) { - continue; - } - } - - if ((it->first->name() != "cinn_op.reshape") && - (it->first->name() != "cinn_op.broadcast") && - (it->first->num_operands() == 1)) { - in_dim = it->first->operand_source(0) - .type() - .dyn_cast() - .dims(); - } else { - in_dim = it->first->result(0) - .type() - .dyn_cast() - .dims(); - } - - cinn::ir::BroadcastInfo info; - if (in_dim.size() == 1u && in_dim[0] == 1u) { - info.full_broadcast = true; - for (size_t i = 0; i < output_shape.size(); ++i) { - info.broadcast_axes.push_back(i); - info.output_shape.push_back(-1); - info.output_dim_expr.push_back(group->loop_ranges_expr()[i]); - } - } else if (in_dim.size() == broadcast_axes.size()) { - if (in_dim.size() != output_shape.size()) { - info.split_first = true; - - if (broadcast_axes.size() == 1) { - std::vector temp_shape(output_shape.size(), 1); - temp_shape[broadcast_axes[0]] = output_shape[broadcast_axes[0]]; - info.split_info.emplace_back(0, temp_shape); - - for (size_t i = 0; i < output_shape.size(); ++i) { - if (i != broadcast_axes[0]) { - info.broadcast_axes.push_back(i); - info.output_shape.push_back(output_shape[i]); - } - } - } else { - throw std::runtime_error("not support multi dim broadcast yet"); - } - } else { - for (size_t i = 0; i < broadcast_axes.size(); ++i) { - if (in_dim[i] < 0 || output_shape[broadcast_axes[i]] < 0) { - continue; - } - if (in_dim[i] != output_shape[broadcast_axes[i]]) { - if (in_dim[i] != 1) { - throw std::runtime_error("Only support 1 - D broadcast "); - } - info.broadcast_axes.push_back(i); - info.output_shape.push_back(output_shape[broadcast_axes[i]]); - } - } - } - } else { - // only deal with broadcast axes - std::set axes_set; - for (size_t i = 0; i < broadcast_axes.size(); ++i) { - axes_set.insert(broadcast_axes[i]); - if (in_dim[broadcast_axes[i]] != 1) { - throw std::runtime_error("Only support 1 - D broadcast "); - } - - 
info.broadcast_axes.push_back(broadcast_axes[i]); - info.output_shape.push_back(output_shape[broadcast_axes[i]]); - } - } - - for (size_t i = 0; i < it->first->num_operands(); ++i) { - if (!align_info.count(it->first->operand_source(i).defining_op())) { - info.first_broadcast = true; - break; - } - } - - auto op_out = it->first->result(0); - info.op_name = it->first->name(); - - if (op_out.use_count() == 1 && - op_out.first_use().owner()->name() == "cf.yield") { - info.with_constrain = true; - } - - if (erase_reshape.count(op_out.first_use().owner())) { - info.with_constrain = true; - } - - group_info->broadcast_info[ValueName(op_out)] = info; - - for (auto use_it = op_out.use_begin(); use_it != op_out.use_end(); - ++use_it) { - if (use_it->owner()->name() == "cf.yield") { - continue; - } - if (CompatibleInfo::OpKind(*(use_it->owner())) == - framework::kBroadcast) { - if (!info.full_broadcast) { - group_info->broadcast_to_elementwise[ValueName( - use_it->owner()->result(0))] = info; - } - } - } - } else { - throw std::runtime_error("only supportbroadcast type for now"); - } - } -} - std::vector OpLowererImpl::LowerCustomCall( const OpLoweringGroupPtr& group) { const auto& ops = group->ops(); @@ -777,10 +620,6 @@ std::vector OpLowererImpl::PostProcess( } } infer_shape_arg_tensor->push_back(tensor); - if ((op_result.defining_op()->name() == "cinn_op.reshape") && - erase_reshape.count(op_result.defining_op())) { - tensor = tensor_map.at(op_result.defining_op()->operand_source(0)); - } if (arg_name_set.count(tensor->buffer->name) != 0) { continue; @@ -846,18 +685,21 @@ std::vector OpLowererImpl::PostProcess( } } std::vector lowered_funcs; - for (ir::Expr func_body : func_bodies) { + for (int i = 0; i < func_bodies.size(); ++i) { + ir::Expr func_body = func_bodies[i]; optim::EliminateDeadScheduleBlock(&(func_body), group->output_names()); - cinn::common::DefaultDeviceTarget().arch.Match( - [&](std::variant) {}, - [&](common::NVGPUArch) { + if (i != func_bodies.size() - 1) { + cinn::common::DefaultDeviceTarget().arch.Match( + [&](std::variant) {}, + [&](common::NVGPUArch) { #ifdef CINN_WITH_CUDA - optim::EliminateCommonGlobalMemoryRead(&(func_body)); - optim::OptimizeExprGPU(&(func_body)); + optim::EliminateCommonGlobalMemoryRead(&(func_body)); + optim::OptimizeExprGPU(&(func_body)); #endif - }); + }); + } // 2.Prepare temp buffers auto temp_buffers = @@ -869,8 +711,13 @@ std::vector OpLowererImpl::PostProcess( func->PrepareBufferCastExprs(); } // 4.Apply low level pass - func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); - optim::RearrangeLoadInstruction(&(func->body)); + if (i != func_bodies.size() - 1) { + func = optim::Optimize(Expr(func), target_, false).as_lowered_func_ref(); + optim::RearrangeLoadInstruction(&(func->body)); + } else { + func = optim::Optimize(Expr(func), common::DefaultHostTarget(), false) + .as_lowered_func_ref(); + } lowered_funcs.push_back(std::move(func)); } @@ -1327,6 +1174,73 @@ ir::LoweredFunc OpLowererImpl::GenerateInferShapeFunc( {}); return infer_shape_func; } +ir::Expr OpLowererImpl::LowerX86(const OpLoweringGroupPtr& group, + const std::vector<::pir::Operation*>& ops, + bool apply_op_schedule) { + std::vector group_func_arg_tensors; + std::unordered_map<::pir::Value, ir::Tensor> tensor_map; + // for some op, it will output more tmp value and regard as + // XX_0, XX_1, so we log them in tmp_tensor_info; + std::unordered_map tmp_tensor_info; + + auto need_lower_x86 = [&]() -> bool { + for (auto* op : ops) { + for (size_t i = 0; i < 
op->num_operands(); ++i) { + auto in = op->operand_source(i); + auto type_info = in.type().dyn_cast(); + auto dtype = type_info.dtype(); + const auto& dims = type_info.dims(); + std::vector sym_shape; + // 1. Dynamic shapes are not lowered for x86. + if (::common::contain_unknown_dim(dims)) { + return false; + } + // 2. Tensors with more than 4 elements are not lowered for x86. + int64_t sym_shape_size = 1; + for (int i = 0; i < dims.size(); ++i) { + sym_shape_size *= dims[i]; + if (sym_shape_size > 4) { + return false; + } + } + } + + std::vector out_types; + std::vector> out_shapes; + CollectOutputInfo(op, &out_types, &out_shapes, group); + for (const auto& tt : out_types) { + // 3. float16 outputs are not lowered for x86. + if (tt.is_float16()) { + return false; + } + } + } + return true; + }; + if (!need_lower_x86()) { + return ir::Expr(-1); + } + + this->target_ = common::DefaultHostTarget(); + cinn::runtime::CurrentTarget::SetCurrentTarget(this->target_); + + std::vector func_bodies = + LowerOps(group, + ops, + apply_op_schedule, + &OpLowererImpl::DyShapeScheduleDetermineFunction, + &group_func_arg_tensors, + &tensor_map, + &tmp_tensor_info); + this->target_ = common::DefaultNVGPUTarget(); + cinn::runtime::CurrentTarget::SetCurrentTarget(this->target_); + ir::ModuleExpr mod_expr(func_bodies); + ir::IRSchedule ir_sch( + mod_expr, -1, false, cinn::utils::ErrorMessageLevel::kGeneral, true); + ir_sch.MergeExprs(); + auto X86Expr = ir::ir_utils::IRCopy(ir_sch.GetModule().GetExprs().at(0)); + return X86Expr; +} } // namespace pir } // namespace framework diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h index 838b70da20fa5..9edb88ec3e431 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -57,10 +57,6 @@ struct GroupInfo { std::set shared_var_names; std::set direct_output_var_names; std::vector broadcast_output_names; - - std::unordered_map broadcast_info; - std::unordered_map - broadcast_to_elementwise; }; class OpLowererImpl : public OpLowererImplBase { @@ -296,12 +292,11 @@ class OpLowererImpl : public OpLowererImplBase { void BuildBroadcastInfo(const OpLoweringGroupPtr& group, std::shared_ptr group_info); - Target target_; - + ir::Expr LowerX86(const OpLoweringGroupPtr& group, + const std::vector<::pir::Operation*>& ops, + bool apply_op_schedule); PrettyNamer* name_gene_; - - std::unordered_set<::pir::Operation*> erase_reshape; }; } // namespace pir diff --git a/paddle/cinn/hlir/framework/pir/utils.h b/paddle/cinn/hlir/framework/pir/utils.h index c489e1847f26f..e3e4e8163cfb9 100644 --- a/paddle/cinn/hlir/framework/pir/utils.h +++ b/paddle/cinn/hlir/framework/pir/utils.h @@ -33,6 +33,7 @@ struct CINNKernelInfo { std::string fn_name; void* fn_ptr; void* infer_shape_fn_ptr; + void* CX86_fn_ptr; struct ArgDimIdx { int arg_idx; diff --git a/paddle/cinn/hlir/pass/alterlayout.cc b/paddle/cinn/hlir/pass/alterlayout.cc index 74c8c0915e0af..a747c57dd77af 100644 --- a/paddle/cinn/hlir/pass/alterlayout.cc +++ b/paddle/cinn/hlir/pass/alterlayout.cc @@ -20,7 +20,7 @@ #include "paddle/cinn/hlir/pe/schedule.h" #include "paddle/cinn/ir/layout.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -119,10 +119,26 @@ std::vector UpdateInferInfos( CHECK(!infertypes.empty()) << node->op()->name << " finds no infertype"; CHECK(!inferlayouts.empty()) << node->op()->name << " finds no inferlayout"; auto outlinks = node->outlinks_in_order(); -
CHECK_EQ(infershapes.size(), infertypes.size()); - CHECK_EQ(inferlayouts.size(), 2U); - CHECK_EQ(infertypes.size(), inferlayouts[0].size()); - CHECK_EQ(outlinks.size(), infershapes.size()); + PADDLE_ENFORCE_EQ( + infershapes.size(), + infertypes.size(), + phi::errors::InvalidArgument( + "The size of infershapes and infertypes should be equal")); + PADDLE_ENFORCE_EQ(inferlayouts.size(), + 2U, + phi::errors::InvalidArgument( + "The size of inferlayouts should be 2, but got %d", + inferlayouts.size())); + PADDLE_ENFORCE_EQ( + infertypes.size(), + inferlayouts[0].size(), + phi::errors::InvalidArgument( + "The size of infertypes and inferlayouts[0] should be equal")); + PADDLE_ENFORCE_EQ( + outlinks.size(), + infershapes.size(), + phi::errors::InvalidArgument( + "The size of outlinks and infershapes should be equal")); for (int i = 0; i < outlinks.size(); i++) { auto* sink = outlinks[i]->sink(); @@ -181,7 +197,11 @@ void AlterLayoutPass(Graph* graph) { node->attrs.attr_store.at("dilation")); } const auto& conv_inlinks = node->inlinks_in_order(); - CHECK_EQ(conv_inlinks.size(), 2U) << "conv2d should have 2 inputs"; + PADDLE_ENFORCE_EQ(conv_inlinks.size(), + 2U, + phi::errors::InvalidArgument( + "conv2d should have 2 inputs, but got %d", + conv_inlinks.size())); std::vector> inputs_shape; for (auto& link : conv_inlinks) { auto* source = link->source(); @@ -231,8 +251,11 @@ void AlterLayoutPass(Graph* graph) { input_nodes.push_back(source); } // get new layout: ic_bn, oc_bn - CHECK_EQ(input_nodes.size(), 2U) - << "conv2d should have 2 input nodes"; + PADDLE_ENFORCE_EQ(input_nodes.size(), + 2U, + phi::errors::InvalidArgument( + "conv2d should have 2 input nodes, but got %d", + input_nodes.size())); auto* input_node = input_nodes[0]; auto* weight_node = input_nodes[1]; CHECK(shape_dict.count(input_node->id())) @@ -347,8 +370,11 @@ void AlterLayoutPass(Graph* graph) { conv2d_NCHWc_inputtypes.push_back(trans_out_dtypes); conv2d_NCHWc_inputlayouts.push_back(dst_input_layout); } else { - CHECK_EQ(input_shape.size(), 5U) - << "conv2d_NCHWc op's input shape dim should be 5"; + PADDLE_ENFORCE_EQ( + input_shape.size(), + 5U, + phi::errors::InvalidArgument( + "conv2d_NCHWc op's input shape dim should be 5")); conv2d_NCHWc_inputshapes.push_back(input_shape); conv2d_NCHWc_inputtypes.push_back(input_type); CHECK(layout_dict.count(input_node->id())) @@ -395,8 +421,11 @@ void AlterLayoutPass(Graph* graph) { conv2d_NCHWc_inputtypes.push_back(trans_out_dtypes); conv2d_NCHWc_inputlayouts.push_back(dst_kernel_layout); } else { - CHECK_EQ(weight_shape.size(), 6U) - << weight_node->id() << " shape dim should be 6"; + PADDLE_ENFORCE_EQ( + weight_shape.size(), + 6U, + phi::errors::InvalidArgument( + "conv2d_NCHWc op's weight shape dim should be 6")); conv2d_NCHWc_inputshapes.push_back(weight_shape); conv2d_NCHWc_inputtypes.push_back(weight_type); CHECK(layout_dict.count(weight_node->id())) @@ -477,12 +506,29 @@ void AlterLayoutPass(Graph* graph) { input_shapes, input_layouts, node->attrs, graph->target_); // if input inferred layouts is different from original's, expand dims // or do transformation. 
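All of the assertion rewrites in this file, and in the passes below, follow one mechanical recipe; as a reference, this is the general before/after shape, taken directly from the hunk just below, so nothing here is new API:

  // before: glog-style check, message appended with operator<<
  CHECK_EQ(inferlayouts.size(), 2U);

  // after: PADDLE_ENFORCE_* with an explicit phi::errors payload and a
  // printf-style message that reports the observed value
  PADDLE_ENFORCE_EQ(
      inferlayouts.size(),
      2U,
      phi::errors::InvalidArgument(
          "The size of inferlayouts should be 2, but got %d",
          inferlayouts.size()));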
- CHECK_EQ(inferlayouts.size(), 2U); + PADDLE_ENFORCE_EQ( + inferlayouts.size(), + 2U, + phi::errors::InvalidArgument( + "The size of inferlayouts should be 2, but got %d", + inferlayouts.size())); auto new_input_layouts = inferlayouts[1]; auto inlinks = node->inlinks_in_order(); - CHECK_EQ(input_layouts.size(), inlinks.size()); - CHECK_EQ(input_layouts.size(), new_input_layouts.size()); - CHECK_EQ(input_layouts.size(), input_shapes.size()); + PADDLE_ENFORCE_EQ( + input_layouts.size(), + inlinks.size(), + phi::errors::InvalidArgument( + "The size of input_layouts and inlinks should be equal")); + PADDLE_ENFORCE_EQ(input_layouts.size(), + new_input_layouts.size(), + phi::errors::InvalidArgument( + "The size of input_layouts and " + "new_input_layouts should be equal")); + PADDLE_ENFORCE_EQ( + input_layouts.size(), + input_shapes.size(), + phi::errors::InvalidArgument("The size of input_layouts and " + "input_shapes should be equal")); bool reset_axis = false; for (int i = 0; i < inlinks.size(); i++) { if (input_layouts[i] != new_input_layouts[i]) { diff --git a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc index 0326a4a5fce33..c0bccf285c730 100644 --- a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc +++ b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc @@ -27,7 +27,7 @@ #include "paddle/cinn/hlir/framework/visualize_helper.h" #include "paddle/cinn/hlir/pass/fusion_helper_base.h" #include "paddle/cinn/runtime/custom_function.h" - +#include "paddle/common/enforce.h" namespace cinn::hlir::pass { using framework::Graph; @@ -529,8 +529,10 @@ std::vector CheckFusionAccuracyPass::TopologicalOrder( } } - CHECK_EQ(ordered_nodes.size(), nodes.size()) - << "There has circle in group! Please check."; + PADDLE_ENFORCE_EQ( + ordered_nodes.size(), + nodes.size(), + phi::errors::InvalidArgument("There is a cycle in the group! 
Please check.")); return ordered_nodes; } diff --git a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc index 10f5c83e6600d..447da47e147dc 100644 --- a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc +++ b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc @@ -19,7 +19,7 @@ #include "paddle/cinn/common/target.h" #include "paddle/cinn/frontend/decomposer/test_helper.h" - +#include "paddle/common/enforce.h" namespace cinn::frontend { using hlir::framework::Graph; @@ -96,7 +96,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -134,7 +138,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_1) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -175,7 +183,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_2) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E", "F"}); } @@ -216,7 +228,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_3) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E", "F"}); } @@ -257,7 +273,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_4) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E", "F"}); } @@ -291,7 +311,11 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_5) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B"}); } @@ -328,7 +352,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_0) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << 
graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -365,7 +393,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_2) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -404,7 +436,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_4) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E"}); } @@ -443,7 +479,11 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_5) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D", "E"}); } @@ -479,7 +519,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_0) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B"}); } @@ -514,7 +558,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_1) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B"}); } @@ -552,7 +600,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_2) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C"}); } @@ -590,7 +642,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_3) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the 
pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -629,7 +685,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_4) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B", "C", "D"}); } @@ -665,7 +725,11 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_5) { VLOG(1) << "After CheckFusionAccuracyPass:\n" << graph->DebugGroupedGraph(std::unordered_set{}); - CHECK_EQ(graph->fusion_groups.size(), group_size_after); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + group_size_after, + phi::errors::InvalidArgument( + "The number of fusion groups is not equal to the " + "number of groups after the pass.")); RunTest(target, graph, {"A", "B"}); } diff --git a/paddle/cinn/hlir/pass/constant_folding_pass_util.cc b/paddle/cinn/hlir/pass/constant_folding_pass_util.cc index 748948f2206fc..a6fb84f76b832 100644 --- a/paddle/cinn/hlir/pass/constant_folding_pass_util.cc +++ b/paddle/cinn/hlir/pass/constant_folding_pass_util.cc @@ -21,7 +21,7 @@ #include "paddle/cinn/hlir/op/op_util.h" #include "paddle/cinn/utils/functional.h" #include "paddle/cinn/utils/type_defs.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -238,7 +238,10 @@ void fold_expand_dims_fill_constant(const FusionHelperBase* helper, // [0, total_size-1]. check axes can't repeat. std::sort(axes.begin(), axes.end(), std::less()); for (int idx = 0; idx < axes_size - 1; ++idx) { - CHECK_NE(axes[idx], axes[idx + 1]); + PADDLE_ENFORCE_NE(axes[idx], + axes[idx + 1], + phi::errors::InvalidArgument( + "The axes of expand_dims should not repeat.")); } // insert 1 to new shape. 
std::vector n_shape(total_size, 1); diff --git a/paddle/cinn/hlir/pass/dce_pass.cc b/paddle/cinn/hlir/pass/dce_pass.cc index b17f8ee4de5d9..2a68e90bc342a 100644 --- a/paddle/cinn/hlir/pass/dce_pass.cc +++ b/paddle/cinn/hlir/pass/dce_pass.cc @@ -16,7 +16,7 @@ #include "paddle/cinn/common/type.h" #include "paddle/cinn/hlir/pass/op_fusion_pass_util.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -118,7 +118,10 @@ class DceHelper : public FusionHelperBase { }; void DCEPassInternal(Graph* graph) { - CHECK_GT(graph->outputs.size(), 0); + PADDLE_ENFORCE_GT(graph->outputs.size(), + 0, + phi::errors::InvalidArgument( + "The graph should have at least one output node.")); DceHelper dce_helper(graph); dce_helper(); } diff --git a/paddle/cinn/hlir/pass/dce_pass_test.cc b/paddle/cinn/hlir/pass/dce_pass_test.cc index bb9c5d7654851..1ebc0878ee2cb 100644 --- a/paddle/cinn/hlir/pass/dce_pass_test.cc +++ b/paddle/cinn/hlir/pass/dce_pass_test.cc @@ -15,7 +15,7 @@ #include #include "paddle/cinn/frontend/decomposer/test_helper.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace frontend { @@ -36,7 +36,10 @@ TEST(DCE, Test_0) { std::make_shared(program, fetch_ids, target); hlir::framework::ApplyPass(graph.get(), "DCE"); - CHECK_EQ(graph->nodes().size(), 4); + PADDLE_ENFORCE_EQ( + graph->nodes().size(), + 4, + phi::errors::InvalidArgument("The graph should have 4 nodes.")); } TEST(DCE, Test_1) { @@ -59,7 +62,10 @@ TEST(DCE, Test_1) { auto graph = std::make_shared(program, fetch_ids, target); hlir::framework::ApplyPass(graph.get(), "DCE"); - CHECK_EQ(graph->nodes().size(), 8); + PADDLE_ENFORCE_EQ( + graph->nodes().size(), + 8, + phi::errors::InvalidArgument("The graph should have 8 nodes.")); } } // namespace frontend diff --git a/paddle/cinn/hlir/pass/dense_merge_pass.cc b/paddle/cinn/hlir/pass/dense_merge_pass.cc index a726aa1a36c1a..1fc5e4a52b60d 100644 --- a/paddle/cinn/hlir/pass/dense_merge_pass.cc +++ b/paddle/cinn/hlir/pass/dense_merge_pass.cc @@ -15,7 +15,7 @@ #include "paddle/cinn/common/graph_utils.h" #include "paddle/cinn/common/type.h" #include "paddle/cinn/hlir/pass/fusion_helper_base.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -100,7 +100,13 @@ class DenseMergePassHelper : public FusionHelperBase { std::unordered_map> dense_op_map; for (auto dense_op : dense_ops) { const auto& in_links = dense_op->inlinks_in_order(); - CHECK_GT(in_links.size(), pos); + PADDLE_ENFORCE_GT(in_links.size(), + pos, + phi::errors::InvalidArgument( + "The input link size of dense op should be greater " + "than %d, but got %d.", + pos, + in_links.size())); auto sign = GenOpSign(in_links[pos]->source()->safe_as(), dense_op->attrs); if (dense_op_map.count(sign)) { @@ -131,7 +137,14 @@ class DenseMergePassHelper : public FusionHelperBase { const auto& in_links = op->inlinks_in_order(); node->UnLinkSingleTo(op); // link to new node - CHECK_GT(in_links.size(), pos); + PADDLE_ENFORCE_GT( + in_links.size(), + pos, + phi::errors::InvalidArgument("The input link size of dense " + "op should be greater than %d, " + "but got %d.", + pos, + in_links.size())); in_links[pos]->source()->LinkTo(node_tmp); // unlink old dense node in_links[pos]->source()->UnLinkSingleTo(op); diff --git a/paddle/cinn/hlir/pass/dot_merger.cc b/paddle/cinn/hlir/pass/dot_merger.cc index 941cf6b29b66c..6e4e4108ecd91 100644 --- a/paddle/cinn/hlir/pass/dot_merger.cc +++ b/paddle/cinn/hlir/pass/dot_merger.cc @@ -16,7 +16,7 @@ #include
"paddle/cinn/hlir/framework/graph.h" #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/pass/infershape.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -368,9 +368,12 @@ class DotMergerPass { input_operand(merge_nodes[i - 1], axis)->id()); auto shape_b = builder->shape_dict().at(input_operand(merge_nodes[i], axis)->id()); - CHECK_EQ(shape_a[1 - axis], shape_b[1 - axis]) - << "The shape of matmul is error. " << shape_a.size() << ", " - << shape_b.size(); + PADDLE_ENFORCE_EQ( + shape_a[1 - axis], + shape_b[1 - axis], + phi::errors::InvalidArgument("The shape of matmul is error. %d, %d", + shape_a.size(), + shape_b.size())); concat_nodes.push_back(input_operand(merge_nodes[i], axis)); } auto* concat_out = builder->Concat(axis, concat_nodes); @@ -444,9 +447,12 @@ class DotMergerPass { auto shape_shared = builder->shape_dict().at(shared_input->id()); auto shape_a = builder->shape_dict().at(input_a->id()); auto shape_b = builder->shape_dict().at(input_b->id()); - CHECK_EQ(shape_a[1 - axis], shape_b[1 - axis]) - << "The shape of matmul is error. " << shape_a.size() << ", " - << shape_b.size(); + PADDLE_ENFORCE_EQ( + shape_a[1 - axis], + shape_b[1 - axis], + phi::errors::InvalidArgument("The shape of matmul is error. %d, %d", + shape_a.size(), + shape_b.size())); auto* concat_out = builder->Concat(axis, {input_a, input_b}); NodeData* matmul_out{}; if (!lhs) { diff --git a/paddle/cinn/hlir/pass/fusion_helper_base.h b/paddle/cinn/hlir/pass/fusion_helper_base.h index 3437b334fa5df..79580815d91bf 100644 --- a/paddle/cinn/hlir/pass/fusion_helper_base.h +++ b/paddle/cinn/hlir/pass/fusion_helper_base.h @@ -23,7 +23,7 @@ #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/pass/use_pass.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -104,7 +104,10 @@ class FusionHelperBase { shape_t GetNodeInputShape(const Node* node) const { auto node_datas = GetProducerNodeData(node); - CHECK_GT(node_datas.size(), 0); + PADDLE_ENFORCE_GT( + node_datas.size(), + 0, + phi::errors::InvalidArgument("The input node should not be empty!")); CHECK(shape_dict_.count(node_datas[0]->id())) << "Can't find " << node_datas[0]->id() << " 's shape!"; return shape_dict_.at(node_datas[0]->id()); @@ -168,7 +171,10 @@ class FusionHelperBase { int GetSharedSize(const Node* node) const { auto producers = GetProducerNodeData(node); - CHECK_GT(producers.size(), 0); + PADDLE_ENFORCE_GT( + producers.size(), + 0, + phi::errors::InvalidArgument("The input node should not be empty!")); auto inshape = shape_dict_.at(producers[0]->id()); auto axes = absl::get>(node->attrs.attr_store.at("dim")); if (WithoutLastDimInReduce(inshape, axes)) { diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass.cc b/paddle/cinn/hlir/pass/fusion_merge_pass.cc index fd023662f9050..0d93dd1593c4f 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/cinn/hlir/pass/fusion_merge_pass_util.h" - +#include "paddle/common/enforce.h" PD_DECLARE_bool(enhance_vertical_fusion_with_recompute); namespace cinn { @@ -705,7 +705,11 @@ class FusionMergePassHelper : public FusionHelperBase { } } - CHECK_GE(producer->consumer_groups().size(), candidates.size()); + PADDLE_ENFORCE_GE(producer->consumer_groups().size(), + candidates.size(), + phi::errors::InvalidArgument( + "The number of candidates should be less than or " + "equal to the number of consumer groups!")); if (producer->consumer_groups().size() == 0 && candidates.size() == 0 && output_nodes_set_.count(producer->CollectNodes()[0]) == 0) { producer->belong_groups.insert(*fusionable_consumers->begin()); @@ -959,8 +963,16 @@ class FusionMergePassHelper : public FusionHelperBase { CHECK(consumer->belong_groups.size()); consumers.insert(*consumer->belong_groups.begin()); } - CHECK_EQ(group->producer_groups().size(), producers.size()); - CHECK_EQ(group->consumer_groups().size(), consumers.size()); + PADDLE_ENFORCE_EQ(group->producer_groups().size(), + producers.size(), + phi::errors::InvalidArgument( + "The number of producers should be equal to the " + "number of producer groups!")); + PADDLE_ENFORCE_EQ(group->consumer_groups().size(), + consumers.size(), + phi::errors::InvalidArgument( + "The number of consumers should be equal to the " + "number of consumer groups!")); (*group->mut_producer_groups()) = producers; (*group->mut_consumer_groups()) = consumers; } diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc b/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc old mode 100755 new mode 100644 index f6f9ecee97c43..14cc221edaaf0 --- a/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc @@ -15,7 +15,7 @@ #include #include "paddle/cinn/frontend/decomposer/test_helper.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace frontend { @@ -39,9 +39,15 @@ TEST(FusionMergePass, ElementWise_Fusion_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_1) { @@ -65,9 +71,15 @@ TEST(FusionMergePass, ElementWise_Fusion_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_2) { @@ -94,9 +106,15 @@ TEST(FusionMergePass, ElementWise_Fusion_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 5); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 5, + phi::errors::InvalidArgument( + "The graph fusion groups's 
size should be 5.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_3) { @@ -123,9 +141,15 @@ TEST(FusionMergePass, ElementWise_Fusion_3) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 5); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 5, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 5.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_4) { @@ -152,9 +176,15 @@ TEST(FusionMergePass, ElementWise_Fusion_4) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 5); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 5, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 5.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, ElementWise_Fusion_5) { @@ -174,9 +204,15 @@ TEST(FusionMergePass, ElementWise_Fusion_5) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, Broadcast_Test_0) { @@ -199,9 +235,15 @@ TEST(FusionMergePass, Broadcast_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, Broadcast_Test_1) { @@ -224,9 +266,15 @@ TEST(FusionMergePass, Broadcast_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } TEST(FusionMergePass, Broadcast_Test_2) { @@ -249,9 +297,15 @@ TEST(FusionMergePass, 
Broadcast_Test_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, Broadcast_Test_3) { @@ -274,9 +328,15 @@ TEST(FusionMergePass, Broadcast_Test_3) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, Broadcast_Test_4) { @@ -301,9 +361,15 @@ TEST(FusionMergePass, Broadcast_Test_4) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, Broadcast_Test_5) { @@ -328,9 +394,15 @@ TEST(FusionMergePass, Broadcast_Test_5) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); } TEST(FusionMergePass, Reduce_Test_0) { @@ -352,7 +424,10 @@ TEST(FusionMergePass, Reduce_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); // CHECK_EQ(graph->fusion_groups.size(), 2); } @@ -375,9 +450,15 @@ TEST(FusionMergePass, Reduce_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, 
Reduce_Test_2) { @@ -401,9 +482,15 @@ TEST(FusionMergePass, Reduce_Test_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 2.")); } TEST(FusionMergePass, Reduce_Test_3) { @@ -427,7 +514,10 @@ TEST(FusionMergePass, Reduce_Test_3) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 4); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 4, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 4.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); // CHECK_EQ(graph->fusion_groups.size(), 3); } @@ -454,7 +544,10 @@ TEST(FusionMergePass, Reduce_Test_4) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 5); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 5, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 5.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); // CHECK_EQ(graph->fusion_groups.size(), 3); } @@ -478,9 +571,15 @@ TEST(FusionMergePass, Reduce_Test_5) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 3); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 3, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 3.")); hlir::framework::ApplyPass(graph.get(), "FusionMergePass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ(graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument( + "The graph fusion groups's size should be 1.")); } } // namespace frontend diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc index b9d553019a459..b27565194f293 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc @@ -25,7 +25,7 @@ #include "paddle/cinn/hlir/pass/general_fusion_merge_pass/lightware_fuse_pass.h" #include "paddle/cinn/hlir/pass/general_fusion_merge_pass/lightware_fuse_pass_ctx.h" #include "paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h" - +#include "paddle/common/enforce.h" PD_DECLARE_bool(enhance_vertical_fusion_with_recompute); namespace cinn { @@ -840,7 +840,11 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { } } - CHECK_GE(producer->consumer_groups().size(), candidates.size()); + PADDLE_ENFORCE_GE( + producer->consumer_groups().size(), + candidates.size(), + phi::errors::Fatal("The number of candidates should be less than or " + "equal to the number of consumers.")); if (producer->consumer_groups().size() == 0 && candidates.size() == 0 && output_nodes_set_.count(producer->CollectNodes()[0]) == 0) { producer->belong_groups.insert(*fusionable_consumers->begin()); @@ -1035,8 +1039,14 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { CHECK(consumer->belong_groups.size()); consumers.insert(*consumer->belong_groups.begin()); } - 
CHECK_EQ(group->producer_groups().size(), producers.size()); - CHECK_EQ(group->consumer_groups().size(), consumers.size()); + PADDLE_ENFORCE_EQ( + group->producer_groups().size(), + producers.size(), + phi::errors::InvalidArgument("Producer size is not equal!")); + PADDLE_ENFORCE_EQ( + group->consumer_groups().size(), + consumers.size(), + phi::errors::InvalidArgument("Consumer size is not equal!")); (*group->mut_producer_groups()) = producers; (*group->mut_consumer_groups()) = consumers; } diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h b/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h index 2195d4a4f947b..a8ccbcef27a16 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h @@ -16,7 +16,7 @@ #include "paddle/cinn/api/op_group.h" #include "paddle/cinn/hlir/pass/fusion_merge_pass_util.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -135,7 +135,10 @@ inline bool WithoutLastDimInReduce(const api::Shape& inshape, static int GetSharedSize(const api::OpNode& op_node) { const auto& producers = op_node.inputs(); - CHECK_GT(producers.size(), 0); + PADDLE_ENFORCE_GT(producers.size(), + 0, + phi::errors::InvalidArgument( + "The producer size should be greater than 0.")); const auto& inshape = producers[0].shape(); const auto& axes = op_node.GetAttr>("dim"); if (WithoutLastDimInReduce(inshape, axes)) { diff --git a/paddle/cinn/hlir/pass/infershape.cc b/paddle/cinn/hlir/pass/infershape.cc index 041a63b42b57c..c6a7a6422d8a8 100644 --- a/paddle/cinn/hlir/pass/infershape.cc +++ b/paddle/cinn/hlir/pass/infershape.cc @@ -19,7 +19,7 @@ #include "paddle/cinn/hlir/pass/use_pass.h" #include "paddle/cinn/hlir/pe/schedule.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -76,16 +76,16 @@ void InferShape(Node* node, auto out_dtype = op_inferdtype[node->op()](inputs_dtype, node->attrs.attr_store); - CHECK_GE(node->outlinks_in_order().size(), out_shape.size()) - << "The output number of node " << node->id() << " is " - << node->outlinks_in_order().size() - << " , which is smaller than the output shape size " << out_shape.size() - << " . And the op type is " << node->op()->name; - CHECK_GE(node->outlinks_in_order().size(), out_dtype.size()) - << "The output number of node " << node->id() << " is " - << node->outlinks_in_order().size() - << " , which is smaller than the output dtype size " << out_dtype.size() - << " . 
And the op type is " << node->op()->name; + PADDLE_ENFORCE_GE( + node->outlinks_in_order().size(), + out_shape.size(), + phi::errors::InvalidArgument("The output number of node is smaller " + "than the output shape size")); + PADDLE_ENFORCE_GE( + node->outlinks_in_order().size(), + out_dtype.size(), + phi::errors::InvalidArgument("The output number of node is smaller " + "than the output dtype size")); int counter = 0; for (auto& out_edge : node->outlinks_in_order()) { diff --git a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc old mode 100755 new mode 100644 index c9d723c91be50..8c18782cc031d --- a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc +++ b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc @@ -15,7 +15,7 @@ #include #include "paddle/cinn/frontend/decomposer/test_helper.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace frontend { @@ -39,7 +39,10 @@ TEST(OpFusionPass, ElementWise_Fusion_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, ElementWise_Fusion_1) { @@ -63,7 +66,10 @@ TEST(OpFusionPass, ElementWise_Fusion_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Broadcast_Test_0) { @@ -86,7 +92,10 @@ TEST(OpFusionPass, Broadcast_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Broadcast_Test_1) { @@ -111,7 +120,10 @@ TEST(OpFusionPass, Broadcast_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Broadcast_Test_2) { @@ -131,7 +143,10 @@ TEST(OpFusionPass, Broadcast_Test_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Reduce_Test_0) { @@ -155,7 +170,10 @@ TEST(OpFusionPass, Reduce_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument("fusion group size should be 2")); } TEST(OpFusionPass, Reduce_Test_1) { @@ -180,7 +198,10 @@ TEST(OpFusionPass, Reduce_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Reduce_Test_2) { @@ -205,7 +226,10 @@ TEST(OpFusionPass, 
Reduce_Test_2) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 2); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 2, + phi::errors::InvalidArgument("fusion group size should be 2")); } TEST(OpFusionPass, Injective_Test_0) { @@ -229,7 +253,10 @@ TEST(OpFusionPass, Injective_Test_0) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OP_LOWERING, Injective_Test_1) { @@ -247,7 +274,10 @@ TEST(OP_LOWERING, Injective_Test_1) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } TEST(OpFusionPass, Test_Insert_BroadcastTo) { @@ -269,7 +299,10 @@ TEST(OpFusionPass, Test_Insert_BroadcastTo) { auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); - CHECK_EQ(graph->fusion_groups.size(), 1); + PADDLE_ENFORCE_EQ( + graph->fusion_groups.size(), + 1, + phi::errors::InvalidArgument("fusion group size should be 1")); } } // namespace frontend diff --git a/paddle/cinn/hlir/pass/opfusion.cc b/paddle/cinn/hlir/pass/opfusion.cc index c8690c0625fbb..84a4071144f96 100644 --- a/paddle/cinn/hlir/pass/opfusion.cc +++ b/paddle/cinn/hlir/pass/opfusion.cc @@ -21,7 +21,7 @@ #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/pass/use_pass.h" #include "paddle/cinn/utils/string.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -48,8 +48,14 @@ void GetBroadcastPattern( if (*pattern == framework::kBroadcast) { auto inlinks = op_node->inlinks(); auto outlinks = op_node->outlinks(); - CHECK_EQ(inlinks.size(), 2U); - CHECK_EQ(outlinks.size(), 1U); + PADDLE_ENFORCE_EQ( + inlinks.size(), + 2U, + phi::errors::InvalidArgument("Broadcast op should have 2 inputs")); + PADDLE_ENFORCE_EQ( + outlinks.size(), + 1U, + phi::errors::InvalidArgument("Broadcast op should have 1 output")); std::vector input_shapes; for (auto link : inlinks) { auto source = link->source(); @@ -233,7 +239,11 @@ class GraphPartition { std::vector> Partition( const std::vector& graph_nodes, const std::vector& dom_nodes) { - CHECK_EQ(graph_nodes.size(), dom_nodes.size()); + PADDLE_ENFORCE_EQ( + graph_nodes.size(), + dom_nodes.size(), + phi::errors::InvalidArgument( + "graph_nodes size should be equal to dom_nodes size")); InitGroups(graph_nodes); for (int i = 0; i < 2; i++) { FuseGroups(graph_nodes, dom_nodes, i); @@ -457,8 +467,16 @@ class GraphPartition { void FuseGroups(const std::vector& graph_nodes, const std::vector& dom_nodes, int phase) { - CHECK_EQ(graph_nodes.size(), dom_nodes.size()); - CHECK_EQ(group_nodes_.size(), dom_nodes.size()); + PADDLE_ENFORCE_EQ( + graph_nodes.size(), + dom_nodes.size(), + phi::errors::InvalidArgument( + "graph_nodes size should be equal to dom_nodes size")); + PADDLE_ENFORCE_EQ( + group_nodes_.size(), + dom_nodes.size(), + phi::errors::InvalidArgument( + "group_nodes size should be equal to dom_nodes size")); for (int i = 0; i < graph_nodes.size(); i++) { auto* graph_node = graph_nodes[i]; auto* dom_node = dom_nodes[i]; @@ -521,7 +539,11 @@ class GraphPartition { } 
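The conversion recipe repeated across these files is mechanical; the following minimal sketch reuses the names from the Partition/FuseGroups checks above (illustrative only, not a line from this diff) to show the before/after shape:
// Before: the glog-style macro aborts the process, reporting only the
// stringified expression with no further context.
CHECK_EQ(graph_nodes.size(), dom_nodes.size());
// After: PADDLE_ENFORCE_EQ raises a catchable Paddle exception and carries
// an explicit phi::errors payload stating the violated expectation.
PADDLE_ENFORCE_EQ(
    graph_nodes.size(),
    dom_nodes.size(),
    phi::errors::InvalidArgument(
        "graph_nodes size should be equal to dom_nodes size"));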
void SplitGroups(const std::vector& graph_nodes) { // split groups sorted by topo order - CHECK_EQ(graph_nodes.size(), group_nodes_.size()); + PADDLE_ENFORCE_EQ( + graph_nodes.size(), + group_nodes_.size(), + phi::errors::InvalidArgument( + "graph_nodes size should be equal to group_nodes size")); absl::flat_hash_map> group_maps; std::set root_indice; for (int i = 0; i < graph_nodes.size(); i++) { diff --git a/paddle/cinn/hlir/pass/reduce_split_pass.cc b/paddle/cinn/hlir/pass/reduce_split_pass.cc index 899c233866ca5..cbb6ffa658c47 100644 --- a/paddle/cinn/hlir/pass/reduce_split_pass.cc +++ b/paddle/cinn/hlir/pass/reduce_split_pass.cc @@ -18,7 +18,7 @@ #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/pass/infershape.h" #include "paddle/cinn/hlir/pe/nn_util.h" - +#include "paddle/common/enforce.h" namespace cinn { namespace hlir { namespace pass { @@ -103,7 +103,11 @@ class ReduceSplitPass { auto in_shape = shape_dict.at(in->id()); auto out_shape = shape_dict.at(out->id()); // all preceding reduced - CHECK_GT(in_shape.size(), 1); + PADDLE_ENFORCE_GT( + in_shape.size(), + 1, + phi::errors::InvalidArgument( + "The input shape size should be greater than 1.")); // [NHWC]->[C], only the last dim kept bool all_preceding_dim_reduced = true; for (auto i = 0; i < in_shape.size() - 1; ++i) { @@ -122,7 +126,10 @@ class ReduceSplitPass { in_shape.begin(), in_shape.end(), 1, std::multiplies()); int reduce_numel = std::accumulate( in_shape.begin(), in_shape.end() - 1, 1, std::multiplies()); - CHECK_GT(reduce_numel, 0); + PADDLE_ENFORCE_GT(reduce_numel, + 0, + phi::errors::InvalidArgument( + "The reduce_numel should be greater than 0.")); // if the numel is not large enough, it is no need to split // if loop times is too large with reduce optimize int size = std::accumulate( @@ -132,7 +139,10 @@ class ReduceSplitPass { auto shape = pe::GetFirstStepReduceShape( {size, in_shape.back()}, {0}, bound, tail); CHECK(bound); - CHECK_EQ(shape.size(), 3); + PADDLE_ENFORCE_EQ(shape.size(), + 3, + phi::errors::InvalidArgument( + "The shape size should be equal to 3.")); auto res = DivideToClosetNum(reduce_numel); int reduce_numel0 = std::get<0>(res), reduce_numel1 = std::get<1>(res); diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc index 41eb7f2fd2c10..41deddc1507e3 100644 --- a/paddle/cinn/hlir/pe/elementwise.cc +++ b/paddle/cinn/hlir/pe/elementwise.cc @@ -360,8 +360,8 @@ ir::Tensor GenerateShape(const std::vector& inputs, const std::vector& output_dim_exprs, const std::string& name) { if (output_dim_exprs.size() != 1) { - LOG(WARNING) << "pe::GenerateShape will return a meaningless tensor when " - "output_dim_exprs.size() != 1"; + VLOG(4) << "pe::GenerateShape will return a meaningless tensor when " + "output_dim_exprs.size() != 1"; return Compute( {Expr(1)}, [=](const std::vector& indice) { return Expr(1); }, diff --git a/paddle/cinn/hlir/pe/schedule_param.proto b/paddle/cinn/hlir/pe/schedule_param.proto index 1d869a570706d..4d2fca1a1b362 100644 --- a/paddle/cinn/hlir/pe/schedule_param.proto +++ b/paddle/cinn/hlir/pe/schedule_param.proto @@ -1,11 +1,11 @@ // Copyright (c) 2021 CINN Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/cinn/ir/group_schedule/base_group_scheduler.h b/paddle/cinn/ir/group_schedule/base_group_scheduler.h index ef77397066351..a96b972d889ea 100644 --- a/paddle/cinn/ir/group_schedule/base_group_scheduler.h +++ b/paddle/cinn/ir/group_schedule/base_group_scheduler.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include "paddle/cinn/common/macros.h" #include "paddle/cinn/common/target.h" #include "paddle/cinn/ir/group_schedule/config/group_tile_config.h" #include "paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h" @@ -64,6 +65,9 @@ class GroupScheduler { virtual void Schedule() = 0; virtual std::vector> GetIRs() = 0; + virtual std::vector> GetCX86IRs() { + CINN_NOT_IMPLEMENTED; + } std::unordered_set OutputTensorNames() const; diff --git a/paddle/cinn/ir/group_schedule/config/CMakeLists.txt b/paddle/cinn/ir/group_schedule/config/CMakeLists.txt index f6453b645bdc7..256e919fce531 100644 --- a/paddle/cinn/ir/group_schedule/config/CMakeLists.txt +++ b/paddle/cinn/ir/group_schedule/config/CMakeLists.txt @@ -5,7 +5,10 @@ core_gather_headers() gather_srcs(cinnapi_src SRCS group_tile_config.cc) gather_srcs(cinnapi_src SRCS database.cc) -cc_library(file_tile_database SRCS filedatabase.cc) +cc_library( + file_tile_database + SRCS filedatabase.cc + DEPS absl tile_config_proto) foreach(header ${filetileconfig_proto_HDRS}) set(core_proto_includes diff --git a/paddle/cinn/ir/group_schedule/config/database.cc b/paddle/cinn/ir/group_schedule/config/database.cc index a216530126efd..4e3121739b874 100644 --- a/paddle/cinn/ir/group_schedule/config/database.cc +++ b/paddle/cinn/ir/group_schedule/config/database.cc @@ -19,10 +19,16 @@ namespace ir { void NaiveTileConfigDatabase::AddConfig( const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority) { + IterSpaceType iter_space_type = [&] { + std::vector> res; + for (const auto& dim : bucket_info.space) { + res.emplace_back(dim.iter_type, (dim.is_dynamic ? 
"dynamic" : "static")); + } + return res; + }(); config_map_[iter_space_type][bucket_info] = config; } diff --git a/paddle/cinn/ir/group_schedule/config/database.h b/paddle/cinn/ir/group_schedule/config/database.h index 9d61f0dd615a5..14367ee492bba 100644 --- a/paddle/cinn/ir/group_schedule/config/database.h +++ b/paddle/cinn/ir/group_schedule/config/database.h @@ -32,7 +32,6 @@ using IterSpaceType = std::vector>; class TileConfigDatabase { public: virtual void AddConfig(const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority) = 0; @@ -45,7 +44,6 @@ class TileConfigDatabase { class NaiveTileConfigDatabase final : public TileConfigDatabase { public: void AddConfig(const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority = 1) override; diff --git a/paddle/cinn/ir/group_schedule/config/filedatabase.cc b/paddle/cinn/ir/group_schedule/config/filedatabase.cc index 64741521802e9..58b5f13338f0a 100644 --- a/paddle/cinn/ir/group_schedule/config/filedatabase.cc +++ b/paddle/cinn/ir/group_schedule/config/filedatabase.cc @@ -39,22 +39,19 @@ namespace ir { bool TileConfigToProto(group_schedule::config::proto::TileData* tile_data, const TileConfigMap& tile_config_map, - const IterSpaceType& iter_space_type, const int& priority) { for (auto& it : tile_config_map) { - group_schedule::config::proto::Dimension s_dimension, r_dimension; - // prepare key---convert bucket info to proto::bucket_info - s_dimension.set_lower_bound(it.first.sp_lower_bound); - s_dimension.set_upper_bound(it.first.sp_upper_bound); - s_dimension.set_iter_type(iter_space_type[0].first); - s_dimension.set_is_dynamic(iter_space_type[0].second == "dynamic"); - r_dimension.set_lower_bound(it.first.rb_lower_bound); - r_dimension.set_upper_bound(it.first.rb_upper_bound); - r_dimension.set_iter_type(iter_space_type[1].first); - r_dimension.set_is_dynamic(iter_space_type[1].second == "dynamic"); - *(tile_data->mutable_bucket_info()->add_dimension()) = s_dimension; - *(tile_data->mutable_bucket_info()->add_dimension()) = r_dimension; + BucketInfo bucket_info = it.first; + int dims = bucket_info.space.size(); + for (int i = 0; i < dims; i++) { + group_schedule::config::proto::Dimension cur_dimension; + cur_dimension.set_lower_bound(bucket_info.space[i].lower_bound); + cur_dimension.set_upper_bound(bucket_info.space[i].upper_bound); + cur_dimension.set_iter_type(bucket_info.space[i].iter_type); + cur_dimension.set_is_dynamic(bucket_info.space[i].is_dynamic); + *(tile_data->mutable_bucket_info()->add_dimension()) = cur_dimension; + } // prepare value---transfer tile_config to proto::tile_config group_schedule::config::proto::TileConfig tc; @@ -114,18 +111,24 @@ std::string IterSpaceTypeToDir(const common::Target target, } bool FileTileConfigDatabase::Tofile(const common::Target& target, - const IterSpaceType& iter_space_type, int priority) { // Step1. To proto TileConfigMap& tile_config_map = target_config_data_; group_schedule::config::proto::TileData tile_data; - auto is_success = - TileConfigToProto(&tile_data, tile_config_map, iter_space_type, priority); + auto is_success = TileConfigToProto(&tile_data, tile_config_map, priority); if (is_success == false) { PADDLE_THROW(::common::errors::Unavailable( "Can't convert tile_config_map to its proto message.")); } // Step2. 
ToJson + IterSpaceType iter_space_type = [&] { + std::vector> res; + auto bucket_info = tile_config_map.begin()->first; + for (const auto& dim : bucket_info.space) { + res.emplace_back(dim.iter_type, (dim.is_dynamic ? "dynamic" : "static")); + } + return res; + }(); std::string dump_path = IterSpaceTypeToDir(target, iter_space_type); size_t length = tile_config_map.size(); std::vector json_lines(length); @@ -187,7 +190,7 @@ bool comparepriority(group_schedule::config::proto::TileData tile_data1, TileConfigMap FileTileConfigDatabase::GetConfigs( const common::Target& target, const IterSpaceType& iter_space_type) const { - // Step1. ReadFromJsonFile->Message; + // Step 1: Read from json file and convert json to proto message std::string file_path = IterSpaceTypeToDir(target, iter_space_type); auto json_lines = ReadLinesFromFile(file_path); size_t line_length = json_lines.size(); @@ -196,39 +199,41 @@ TileConfigMap FileTileConfigDatabase::GetConfigs( line_length); JsonStringToMessageOfTileConfig(&tile_database, json_lines); - // Step2. ParseFromProtoMessage(); + // Step 2: Parse from proto message TileConfigMap tile_config_map; // order tile_database according to priority std::sort(tile_database.begin(), tile_database.end(), comparepriority); for (const auto& piece_tileconfig : tile_database) { group_schedule::config::proto::BucketInfo its = piece_tileconfig.bucket_info(); - // proto::BucketInfo to bucketinfo - BucketInfo bucket_info; - bucket_info.sp_lower_bound = its.dimension(0).lower_bound(); - bucket_info.sp_upper_bound = its.dimension(0).upper_bound(); - bucket_info.rb_lower_bound = its.dimension(1).lower_bound(); - bucket_info.rb_upper_bound = its.dimension(1).upper_bound(); + // Step 2.1: Convert proto bucketinfo to source bucketinfo + int dims = its.dimension_size(); + BucketInfo bucket_info(static_cast(dims)); + for (int i = 0; i < dims; i++) { + bucket_info.space[i].lower_bound = its.dimension(i).lower_bound(); + bucket_info.space[i].upper_bound = its.dimension(i).upper_bound(); + bucket_info.space[i].iter_type = its.dimension(i).iter_type(); + bucket_info.space[i].is_dynamic = its.dimension(i).is_dynamic(); + } + // Step 2.2: Convert proto tile_config to source tile_config ScheduleConfig::TileConfig tconfig; tconfig.tree_reduce_num = piece_tileconfig.tile_config().tree_reduce_num(); tconfig.spatial_inner_num = piece_tileconfig.tile_config().spatial_inner_num(); tconfig.warp_num = piece_tileconfig.tile_config().warp_num(); tile_config_map[bucket_info] = tconfig; - // Tode[XiaZichao] Add function to cut one lattice into smaller ones. 
+ // TODO(XiaZichao): Add function to cut one lattice into smaller ones } - // ToDo[XiaZichao] update json file using top view of tileconfigMap + // TODO(XiaZichao): update json file using top view of tileconfigMap return tile_config_map; } void FileTileConfigDatabase::AddConfig(const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority) { target_config_data_[bucket_info] = config; - auto status = - FileTileConfigDatabase::Tofile(target, iter_space_type, priority); + auto status = FileTileConfigDatabase::Tofile(target, priority); if (status == true) { target_config_data_.clear(); return; diff --git a/paddle/cinn/ir/group_schedule/config/filedatabase.h b/paddle/cinn/ir/group_schedule/config/filedatabase.h index 19758dc828c18..3c6b62c676fe8 100644 --- a/paddle/cinn/ir/group_schedule/config/filedatabase.h +++ b/paddle/cinn/ir/group_schedule/config/filedatabase.h @@ -22,7 +22,6 @@ namespace ir { class FileTileConfigDatabase : TileConfigDatabase { public: void AddConfig(const common::Target& target, - const IterSpaceType& iter_space_type, const BucketInfo& bucket_info, const ScheduleConfig::TileConfig& config, int priority) override; @@ -31,9 +30,7 @@ class FileTileConfigDatabase : TileConfigDatabase { private: TileConfigMap target_config_data_; - bool Tofile(const common::Target& target, - const IterSpaceType& iter_space_type, - int priority); + bool Tofile(const common::Target& target, int priority); }; } // namespace ir diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc index 40c1d134ac642..42f1a02adf723 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.cc +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.cc @@ -20,6 +20,47 @@ namespace ir { const int kMaxNumel = INT32_MAX; +BucketInfo::BucketInfo(int sp_lower_bound, + int sp_upper_bound, + int rb_lower_bound, + int rb_upper_bound, + bool sp_is_dynamic = false, + bool rb_is_dynamic = false) { + BucketInfo::Dimension sp_dimension( + sp_lower_bound, sp_upper_bound, "S", sp_is_dynamic); + BucketInfo::Dimension rb_dimension( + rb_lower_bound, rb_upper_bound, "R", rb_is_dynamic); + this->space.push_back(sp_dimension); + this->space.push_back(rb_dimension); +} + +bool BucketInfo::operator==(const BucketInfo& other) const { + if (this->space.size() != other.space.size()) { + return false; + } + int length = this->space.size(); + for (int i = 0; i < length; i++) { + if (this->space[i].is_dynamic != other.space[i].is_dynamic || + this->space[i].iter_type != other.space[i].iter_type || + this->space[i].lower_bound != other.space[i].lower_bound || + this->space[i].upper_bound != other.space[i].upper_bound) { + return false; + } + } + return true; +} + +std::string BucketInfo::ToString() const { + std::stringstream ss; + ss << "BucketInfo: ["; + for (const auto& dim : space) { + ss << dim.iter_type << "(" << dim.lower_bound << " - " << dim.upper_bound + << "), "; + } + ss << "]"; + return ss.str(); +} + int64_t Next2Power(int64_t n) { if (n == 1) { return 1; @@ -34,8 +75,6 @@ std::shared_ptr InitBasicInfo( base_info->reduce_tensor_names = group_info->reduce_var_names; base_info->shared_var_names = group_info->shared_var_names; base_info->direct_output_var_names = group_info->direct_output_var_names; - base_info->broadcast_info = group_info->broadcast_info; - base_info->broadcast_to_elementwise = group_info->broadcast_to_elementwise; 
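Since the multi-dimensional BucketInfo defined above replaces the old four-bound struct throughout this PR, a minimal usage sketch may help (values are illustrative; the member names are those introduced above):
// The legacy four-bound constructor expands into a dimension vector:
// space[0] is the spatial axis ("S"), space[1] the reduce axis ("R").
BucketInfo bucket{/* sp_lower_bound = */ 1,
                  /* sp_upper_bound = */ 1023,
                  /* rb_lower_bound = */ 1,
                  /* rb_upper_bound = */ 1,
                  /* sp_is_dynamic = */ true,
                  /* rb_is_dynamic = */ false};
// operator== compares every field of every dimension, and ToString()
// renders the space, e.g. "BucketInfo: [S(1 - 1023), R(1 - 1), ]".
VLOG(4) << bucket.ToString();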
base_info->data_rank = group_info->data_space.size(); base_info->raw_data_rank = group_info->raw_data_rank; @@ -190,7 +229,9 @@ BuildStaticSpatialConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ 1, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ kMaxNumel}; + /* rb_upper_bound = */ kMaxNumel, + /* sp_is_dynamic = */ false, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ 8, /* tree_reduce_num = */ 256, @@ -201,7 +242,9 @@ BuildStaticSpatialConfig( BucketInfo bucket_info_1_256{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ 256}; + /* rb_upper_bound = */ 256, + /* sp_is_dynamic = */ false, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config_1_256{ /* warp_num = */ 8, /* tree_reduce_num = */ 32, @@ -211,7 +254,9 @@ BuildStaticSpatialConfig( BucketInfo bucket_info_257_2048{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 257, - /* rb_upper_bound = */ 2048}; + /* rb_upper_bound = */ 2048, + /* sp_is_dynamic = */ false, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config_257_2048{ /* warp_num = */ 8, /* tree_reduce_num = */ 128, @@ -221,7 +266,9 @@ BuildStaticSpatialConfig( BucketInfo bucket_info_2049_INF{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 2049, - /* rb_upper_bound = */ kMaxNumel}; + /* rb_upper_bound = */ kMaxNumel, + /* sp_is_dynamic = */ false, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config_2049_INF{ /* warp_num = */ 8, /* tree_reduce_num = */ 256, @@ -242,7 +289,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info__1_1023{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ 1023, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ 1}; + /* rb_upper_bound = */ 1, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config__1_1023{ /* warp_num = */ -1, /* tree_reduce_num = */ 1, @@ -251,7 +300,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info__1024_1M{/* sp_lower_bound = */ 1024, /* sp_upper_bound = */ 1024 * 1024 - 1, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ 1}; + /* rb_upper_bound = */ 1, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config__1024_1M{ /* warp_num = */ 32, /* tree_reduce_num = */ 1, @@ -260,7 +311,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info__1M_INF{/* sp_lower_bound = */ 1024 * 1024, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ 1}; + /* rb_upper_bound = */ 1, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config__1M_INF{ /* warp_num = */ 32, /* tree_reduce_num = */ 1, @@ -273,7 +326,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 2, - /* rb_upper_bound = */ 256}; + /* rb_upper_bound = */ 256, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ 8, /* tree_reduce_num = */ 32, @@ -290,7 +345,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 257, - /* rb_upper_bound = */ 2048}; + /* rb_upper_bound = */ 2048, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ warp_num, /* tree_reduce_num = */ tree_reduce_num, @@ 
-304,7 +361,9 @@ BuildStaticReduceConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 2049, - /* rb_upper_bound = */ kMaxNumel}; + /* rb_upper_bound = */ kMaxNumel, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ false}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ warp_num, /* tree_reduce_num = */ tree_reduce_num, @@ -324,7 +383,9 @@ BuildDynamicShapeConfig( BucketInfo bucket_info{/* sp_lower_bound = */ 1, /* sp_upper_bound = */ kMaxNumel, /* rb_lower_bound = */ 1, - /* rb_upper_bound = */ kMaxNumel}; + /* rb_upper_bound = */ kMaxNumel, + /* sp_is_dynamic = */ true, + /* rb_is_dynamic = */ true}; ScheduleConfig::TileConfig tile_config{ /* warp_num = */ warp_num, /* tree_reduce_num = */ tree_reduce_num, diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_config.h b/paddle/cinn/ir/group_schedule/config/group_tile_config.h index a62d9dd84fb59..74be11c5f6e40 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_config.h +++ b/paddle/cinn/ir/group_schedule/config/group_tile_config.h @@ -42,9 +42,6 @@ struct ScheduleConfig { std::set temp_var_names; std::set shared_var_names; std::set direct_output_var_names; - - std::unordered_map broadcast_info; - std::unordered_map broadcast_to_elementwise; }; struct TileConfig { @@ -59,27 +56,70 @@ struct ScheduleConfig { }; struct BucketInfo { - int64_t sp_lower_bound = 1; - int64_t sp_upper_bound = INT64_MAX; - int64_t rb_lower_bound = 1; - int64_t rb_upper_bound = INT64_MAX; - - bool operator==(const BucketInfo& other) const { - return this->sp_lower_bound == other.sp_lower_bound && - this->sp_upper_bound == other.sp_upper_bound && - this->rb_lower_bound == other.rb_lower_bound && - this->rb_upper_bound == other.rb_upper_bound; - } + struct Dimension { + int lower_bound; + int upper_bound; + std::string iter_type; + bool is_dynamic; + std::vector weights; + Dimension() + : lower_bound(0), + upper_bound(INT_MAX), + iter_type("S"), + is_dynamic(false) {} + Dimension(int low, int upper, std::string iter_type, bool is_dynamic) + : lower_bound(low), + upper_bound(upper), + iter_type(iter_type), + is_dynamic(is_dynamic) {} + Dimension(int low, + int upper, + std::string iter_type, + bool is_dynamic, + std::vector weights) + : lower_bound(low), + upper_bound(upper), + iter_type(iter_type), + is_dynamic(is_dynamic), + weights(weights) {} + }; + std::vector space; + + std::string ToString() const; + BucketInfo() = default; + BucketInfo(int sp_lower_bound, + int sp_upper_bound, + int rb_lower_bound, + int rb_upper_bound, + bool sp_is_dynamic, + bool rb_is_dynamic); + explicit BucketInfo(size_t size) : space(std::vector(size)) {} + bool operator==(const BucketInfo& other) const; }; struct BucketInfoHash { std::size_t operator()(const BucketInfo& bucket_info) const noexcept { - std::size_t hash_spl = std::hash{}(bucket_info.sp_lower_bound); - std::size_t hash_spu = std::hash{}(bucket_info.sp_upper_bound); - std::size_t hash_rbl = std::hash{}(bucket_info.rb_lower_bound); - std::size_t hash_rbu = std::hash{}(bucket_info.rb_upper_bound); - return adt::hash_combine(adt::hash_combine(hash_spl, hash_spu), - adt::hash_combine(hash_rbl, hash_rbu)); + PADDLE_ENFORCE_GT( + bucket_info.space.size(), + 0, + ::common::errors::InvalidArgument( + "BucketInfo's dimension number should be greater than 0")); + + std::size_t hash_past_dims = adt::hash_combine( + std::hash{}(bucket_info.space[0].lower_bound), + std::hash{}(bucket_info.space[0].upper_bound)); + int dims = 
bucket_info.space.size(); + if (dims == 1) { + return hash_past_dims; + } else { + for (int i = 1; i < dims; i++) { + std::size_t hash_temp_dim = adt::hash_combine( + std::hash{}(bucket_info.space[i].lower_bound), + std::hash{}(bucket_info.space[i].upper_bound)); + hash_past_dims = adt::hash_combine(hash_past_dims, hash_temp_dim); + } + return hash_past_dims; + } } }; diff --git a/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto b/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto index f8e0aeadcfa09..9396092a422fa 100644 --- a/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto +++ b/paddle/cinn/ir/group_schedule/config/tileconfig_desc.proto @@ -1,11 +1,11 @@ // Copyright (c) 2022 CINN Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -36,7 +36,7 @@ message TileConfig{ message TileData{ int32 priority=1; BucketInfo bucket_info =2; - TileConfig tile_config =3; + TileConfig tile_config =3; } message TileDatabase{ diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc index 52a08c7a22900..c42ced360d86e 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc @@ -66,32 +66,42 @@ void DynamicShapeGroupScheduler::InitBuckets() { << iter_space_info.total_sp_extent; VLOG(4) << "iter_space_info.total_rb_extent: " << iter_space_info.total_rb_extent; - VLOG(4) << "bucket_info.sp_lower_bound: " << bucket_info.sp_lower_bound; - VLOG(4) << "bucket_info.sp_upper_bound: " << bucket_info.sp_upper_bound; - VLOG(4) << "bucket_info.rb_lower_bound: " << bucket_info.rb_lower_bound; - VLOG(4) << "bucket_info.rb_upper_bound: " << bucket_info.rb_upper_bound; - if (OutOfRange(iter_space_info.total_sp_extent, - bucket_info.sp_lower_bound, - bucket_info.sp_upper_bound) || - OutOfRange(iter_space_info.total_rb_extent, - bucket_info.rb_lower_bound, - bucket_info.rb_upper_bound)) { - VLOG(4) << "Out of range"; - return; + VLOG(4) << "bucket_info is: "; + int dims = bucket_info.space.size(); + SymbolicPredicate predicate = ir::Expr(true); + for (int i = 0; i < dims; ++i) { + VLOG(4) << "bucket_info.space[" << i + << "].lower_bound= " << bucket_info.space[i].lower_bound; + VLOG(4) << "bucket_info.space[" << i + << "].upper_bound= " << bucket_info.space[i].upper_bound; + if (dims == 2 && bucket_info.space[1].iter_type == "R") { + if (i == 0 && OutOfRange(iter_space_info.total_sp_extent, + bucket_info.space[i].lower_bound, + bucket_info.space[i].upper_bound)) { + VLOG(4) << "Dimension " << i << " Out of range"; + return; + } + if (i == 1 && OutOfRange(iter_space_info.total_rb_extent, + bucket_info.space[i].lower_bound, + bucket_info.space[i].upper_bound)) { + VLOG(4) << "Dimension " << i << " Out of range"; + return; + } + auto extent = (i == 0) ? 
iter_space_info.total_sp_extent : iter_space_info.total_rb_extent; + SymbolicPredicate lower_bound_predicate = + ir::GE::Make(extent, ir::Expr(bucket_info.space[i].lower_bound)); + SymbolicPredicate upper_bound_predicate = + ir::LE::Make(extent, ir::Expr(bucket_info.space[i].upper_bound)); + SymbolicPredicate curr_predicate = + ir::And::Make(lower_bound_predicate, upper_bound_predicate); + predicate = ir::And::Make(predicate, curr_predicate); + } else { + PADDLE_THROW(::common::errors::Unimplemented( + "Currently, InitBuckets only supports the SR iteration " + "space.")); + } } - SymbolicPredicate sp_lower_bound_predicate = ir::GE::Make( - iter_space_info.total_sp_extent, ir::Expr(bucket_info.sp_lower_bound)); - SymbolicPredicate sp_upper_bound_predicate = ir::LE::Make( - iter_space_info.total_sp_extent, ir::Expr(bucket_info.sp_upper_bound)); - SymbolicPredicate rb_lower_bound_predicate = ir::GE::Make( - iter_space_info.total_rb_extent, ir::Expr(bucket_info.rb_lower_bound)); - SymbolicPredicate rb_upper_bound_predicate = ir::LE::Make( - iter_space_info.total_rb_extent, ir::Expr(bucket_info.rb_upper_bound)); - SymbolicPredicate sp_predicate = - ir::And::Make(sp_lower_bound_predicate, sp_upper_bound_predicate); - SymbolicPredicate rb_predicate = - ir::And::Make(rb_lower_bound_predicate, rb_upper_bound_predicate); - SymbolicPredicate predicate = ir::And::Make(sp_predicate, rb_predicate); ScheduleContext schedule_context{output_names, target_, std::move(iter_space_info), @@ -154,6 +164,14 @@ DynamicShapeGroupScheduler::GetIRs() { return irs; } +std::vector> +DynamicShapeGroupScheduler::GetCX86IRs() { + std::vector> irs(1); + irs[0].first = ir::EQ::Make(ir::Expr(1), ir::Expr(1)); + irs[0].second = ir_sch_->GetModule().GetExprs()[0]; + return irs; +} + IterativeSpaceInfo DynamicShapeGroupScheduler::ConstructIterSpaceInfo( ScheduleBlockNode* node) { VLOG(5) << "global master: " << node->id(); diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h index 0e5205a419973..547d68b5a67a9 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h @@ -37,6 +37,7 @@ class DynamicShapeGroupScheduler : public GroupScheduler { void Schedule() override; std::vector> GetIRs() override; + std::vector> GetCX86IRs() override; struct BucketContext { SymbolicPredicate predicate; diff --git a/paddle/cinn/ir/group_schedule/search/config_searcher.cc b/paddle/cinn/ir/group_schedule/search/config_searcher.cc index 5dffb8a78cd5a..3e620d616762f 100644 --- a/paddle/cinn/ir/group_schedule/search/config_searcher.cc +++ b/paddle/cinn/ir/group_schedule/search/config_searcher.cc @@ -25,18 +25,18 @@ namespace search { WeightedSamplingTrailObjectiveFunc::WeightedSamplingTrailObjectiveFunc( ::pir::Program* program, - const IterSpace& iter_space, + const BucketInfo& bucket_info, double sampling_prob, int max_sampling_times, int repeats) : program_(program), - iter_space_(iter_space), + bucket_info_(bucket_info), measurer_(program), sampling_prob_(sampling_prob), max_sampling_times_(max_sampling_times), repeats_(repeats) { double weighted_space_size = 1.0; - for (const auto& dim : iter_space_.space) { + for (const auto& dim : bucket_info_.space) { PADDLE_ENFORCE_EQ(dim.upper_bound - dim.lower_bound + 1, dim.weights.size(), ::common::errors::InvalidArgument( @@ -54,7 +54,7 @@ WeightedSamplingTrailObjectiveFunc::WeightedSamplingTrailObjectiveFunc( // Generate Sampling Inputs const 
auto Sample = [&]() -> std::vector { std::vector samples; - for (IterSpace::Dimension dim : iter_space_.space) { + for (BucketInfo::Dimension dim : bucket_info_.space) { int sampled = utils::SampleDiscreteFromDistribution(dim.weights, &rand_seed_); samples.push_back(static_cast(sampled) + dim.lower_bound); @@ -82,19 +82,15 @@ ScoreType WeightedSamplingTrailObjectiveFunc::operator()( auto tile_config_database = std::make_shared(); IterSpaceType iter_space_type = [&] { std::vector> res; - for (const auto& dim : iter_space_.space) { + for (const auto& dim : bucket_info_.space) { res.emplace_back(dim.iter_type, (dim.is_dynamic ? "dynamic" : "static")); } return res; }(); - BucketInfo bucket_info{iter_space_.space[0].lower_bound, - iter_space_.space[0].upper_bound, - iter_space_.space[1].lower_bound, - iter_space_.space[1].upper_bound}; ScheduleConfig::TileConfig config{ candidate[0], candidate[1], candidate[2], NoneReduceMethod()}; tile_config_database->AddConfig( - cinn::common::DefaultTarget(), iter_space_type, bucket_info, config); + cinn::common::DefaultTarget(), bucket_info_, config); auto& schedule_config_manager = ScheduleConfigManager::Instance(); schedule_config_manager.AddConfigDatabase("custom", tile_config_database); measurer_.Compile(); diff --git a/paddle/cinn/ir/group_schedule/search/config_searcher.h b/paddle/cinn/ir/group_schedule/search/config_searcher.h index 082417388e8a6..4b97547db6851 100644 --- a/paddle/cinn/ir/group_schedule/search/config_searcher.h +++ b/paddle/cinn/ir/group_schedule/search/config_searcher.h @@ -19,6 +19,7 @@ #include #include +#include "paddle/cinn/ir/group_schedule/config/group_tile_config.h" #include "paddle/cinn/ir/group_schedule/search/measurer.h" #include "paddle/cinn/utils/random_engine.h" #include "paddle/pir/include/core/program.h" @@ -39,7 +40,7 @@ class BaseObjectiveFunc { class WeightedSamplingTrailObjectiveFunc : public BaseObjectiveFunc { public: WeightedSamplingTrailObjectiveFunc(::pir::Program* program, - const IterSpace& iter_space, + const BucketInfo& bucket_info, double sampling_prob = 1.0, int max_sampling_times = 65536, int repeats = 10); @@ -48,7 +49,7 @@ class WeightedSamplingTrailObjectiveFunc : public BaseObjectiveFunc { private: ::pir::Program* program_; - IterSpace iter_space_; + BucketInfo bucket_info_; Measurer measurer_; double sampling_prob_; int max_sampling_times_; diff --git a/paddle/cinn/ir/group_schedule/search/measurer.cc b/paddle/cinn/ir/group_schedule/search/measurer.cc index 1934ebea16b36..ea2fa18dcadbb 100644 --- a/paddle/cinn/ir/group_schedule/search/measurer.cc +++ b/paddle/cinn/ir/group_schedule/search/measurer.cc @@ -35,17 +35,6 @@ namespace cinn { namespace ir { namespace search { -std::string IterSpace::ToString() const { - std::stringstream ss; - ss << "IterSpace: ["; - for (const auto& dim : space) { - ss << dim.iter_type << "(" << dim.lower_bound << " - " << dim.upper_bound - << "), "; - } - ss << "]"; - return ss.str(); -} - std::shared_ptr CreatePassManager() { pir::IrContext* ctx = pir::IrContext::Instance(); ctx->GetOrRegisterDialect(); diff --git a/paddle/cinn/ir/group_schedule/search/measurer.h b/paddle/cinn/ir/group_schedule/search/measurer.h index 76de4b6eb065b..4118c40558b55 100644 --- a/paddle/cinn/ir/group_schedule/search/measurer.h +++ b/paddle/cinn/ir/group_schedule/search/measurer.h @@ -30,19 +30,6 @@ namespace cinn { namespace ir { namespace search { -struct IterSpace { - struct Dimension { - int lower_bound; - int upper_bound; - std::string iter_type; - bool is_dynamic; - std::vector 
weights; - }; - std::vector space; - - std::string ToString() const; -}; - struct MeasureResult { ::common::TimeDuration compile_time; ::common::TimeDuration avg_kernel_execute_time; diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc index a807699f330d2..942b522f05f0f 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc @@ -47,11 +47,26 @@ bool IsWarpReduce(const ScheduleConfig& config) { return std::visit(MatchWarpReduce, config.tile_config.reduce_method); } +bool UseReduceTile(const ScheduleConfig& config) { + const auto& raw_reduce_axis = config.base_info->raw_reduce_axis; + const auto raw_data_rank = config.base_info->raw_data_rank; + if (raw_reduce_axis.empty()) { + return false; + } + for (size_t i = 1; i < raw_reduce_axis.size(); i++) { + if (raw_reduce_axis[i] != raw_reduce_axis[i - 1] + 1) { + return false; + } + } + return raw_reduce_axis.back() + 1 == raw_data_rank; +} + class TileFirstGeneralTactic final : public ScheduleTactic { public: void Init(ScheduleContext* context) override; void Apply(ir::IRSchedule* sch, const std::string& block_id) override; + void ApplyReduceTile(ir::IRSchedule* sch, const std::string& block_id); std::string TacticName() const override { return "TileFirstGeneralTactic"; } @@ -98,6 +113,11 @@ void TileFirstGeneralTactic::Init(ScheduleContext* context) { void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch, const std::string& block_id) { + if (UseReduceTile(context_->config)) { + VLOG(4) << "Using ApplyReduceTile"; + ApplyReduceTile(sch, block_id); + return; + } if (ir::IsReduceInitTensorName(block_id)) return; MergeReduceAxis(sch, block_id); VLOG(6) << "After MergeReduceAxis on block: [" << block_id @@ -136,6 +156,106 @@ void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch, SetReduceType(sch, block_id); } +void TileFirstGeneralTactic::ApplyReduceTile(ir::IRSchedule* sch, + const std::string& block_id) { + if (ir::IsReduceInitTensorName(block_id)) return; + + const auto sp_thread = context_->config.tile_config.warp_num * 32 / + context_->config.tile_config.tree_reduce_num; + const auto sp_loop = context_->config.tile_config.spatial_inner_num; + const auto rd_thread = context_->config.tile_config.tree_reduce_num; + VLOG(4) << "ApplyReduceTile sp_thread=" << sp_thread; + VLOG(4) << "ApplyReduceTile sp_loop=" << sp_loop; + VLOG(4) << "ApplyReduceTile rd_thread=" << rd_thread; + VLOG(4) << "ApplyReduceTile vec_flatten_axis: " + << utils::Join(vec_flatten_axis_, ", "); + VLOG(4) << "ApplyReduceTile vec_reduce_axis: " + << utils::Join(vec_reduce_axis_, ", "); + + // Merge reduce axes + MergeReduceAxis(sch, block_id); + VLOG(4) << "After MergeReduceAxis on block: [" << block_id + << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // Merge spatial axes + MergeFlattenAxis(sch, block_id); + VLOG(4) << "After MergeFlattenAxis on block: [" << block_id + << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // Split spatial axes -> [sp_block, sp_loop, sp_thread] + int current_reduce_axis = 0; + if (vec_flatten_axis_.size() > 0) { + auto loops = sch->GetLoops(block_id); + if (sp_loop > 1 && sp_thread > 1) { + sch->Split(loops[0], {-1, sp_loop, sp_thread}); + current_reduce_axis = 3; + } else if (sp_loop > 1 || sp_thread > 1) { + sch->Split(loops[0], {-1, sp_loop > 1 ? 
sp_loop : sp_thread}); + current_reduce_axis = 2; + } else { + current_reduce_axis = 1; + } + } + VLOG(4) << "After SplitSpatial on block: [" << block_id << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // Split reduce axes -> [rd_loop, rd_thread] + if (vec_reduce_axis_.size() > 0) { + auto loops = sch->GetLoops(block_id); + auto reduce_loop = loops[current_reduce_axis].As(); + sch->Split(loops[current_reduce_axis], {-1, rd_thread}); + VLOG(4) << "Before ReorderReduction on block: [" << block_id + << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // TODO(lshpku): the Reorder is unneeded if the later FactorizeReduction + // supports rf_axis=1. + loops = sch->GetLoops(block_id); + sch->Reorder({loops[current_reduce_axis + 1], loops[current_reduce_axis]}); + VLOG(4) << "Before FactorizeReduction on block: [" << block_id + << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + if (IsReduceBlock(context_->config, block_id)) { + loops = sch->GetLoops(block_id); + sch->FactorizeReduction(loops[current_reduce_axis], + /* rf_axis = */ 0, + /* with_write_back_block_init = */ false); + } + } + VLOG(4) << "After SplitReduce on block: [" << block_id << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + // Bind CUDA info + const auto DoBind = [&](const std::vector& loops) { + std::string sp_axis_type = "threadIdx.y"; + std::string rd_axis_type = "threadIdx.x"; + sch->Bind(loops[0], "blockIdx.x"); + if (!vec_flatten_axis_.empty() && sp_thread > 1) { + if (vec_reduce_axis_.empty()) { + sch->Bind(loops[current_reduce_axis - 1], rd_axis_type); + } else { + sch->Bind(loops[current_reduce_axis - 1], sp_axis_type); + } + } + if (!vec_reduce_axis_.empty() && current_reduce_axis > 0) { + sch->Bind(loops[current_reduce_axis], rd_axis_type); + } + }; + DoBind(sch->GetLoops(block_id)); + if (IsReduceBlock(context_->config, block_id) && + sch->HasBlock(block_id + "_rf")) { + DoBind(sch->GetLoops(block_id + "_rf")); + } + VLOG(4) << "After BindCudaInfo on block: [" << block_id << "], loop nest:\n" + << sch->GetModule().GetExprs().front(); + + VariableTypeAssignment(sch, block_id); + SetReduceType(sch, block_id); +} + void TileFirstGeneralTactic::MergeFlattenAxis(ir::IRSchedule* sch, const std::string& block_id) { if (vec_flatten_axis_.size() >= 2) { diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc index 0aaf620874568..adf979c7a7fd4 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc @@ -52,7 +52,14 @@ void TileTactic::Init(ScheduleContext* context) { int64_t extent = static_cast(total_rb_extent.get_constant()); nums_thread_per_block = GetFirstFactor(extent); } else { - nums_thread_per_block = context_->bucket_info.rb_lower_bound; + if (context->bucket_info.space.size() == 2 && + context->bucket_info.space[1].iter_type == "R") { + nums_thread_per_block = context_->bucket_info.space[1].lower_bound; + } else { + PADDLE_THROW(::common::errors::Unimplemented( + "Currently, GetTreeReduceSize only supports the SR iteration " + "space.")); + } } return nums_thread_per_block > max_num_threads ? max_num_threads : nums_thread_per_block; @@ -95,9 +102,17 @@ void TileTactic::Init(ScheduleContext* context) { // other bound to cuda thread. 
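A hedged sketch of the SR-only sizing rule introduced above (illustrative values; max_num_threads is assumed to come from the target, as elsewhere in this pass):
// For a two-dimensional bucket whose second axis is the reduce axis "R",
// the tree-reduce width is seeded from that axis's lower bound, then
// clamped to the per-block thread limit.
BucketInfo bucket{/* sp_lower_bound = */ 1,
                  /* sp_upper_bound = */ kMaxNumel,
                  /* rb_lower_bound = */ 257,
                  /* rb_upper_bound = */ 2048,
                  /* sp_is_dynamic = */ true,
                  /* rb_is_dynamic = */ false};
int nums_thread_per_block = bucket.space[1].lower_bound;  // 257
int max_num_threads = 1024;  // assumed hardware limit for this sketch
int tree_reduce_size = nums_thread_per_block > max_num_threads
                           ? max_num_threads
                           : nums_thread_per_block;  // 257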
context_->iter_space_info.sp_space.emplace_back( ir::Expr(-1), IterativeSpaceInfo::AxisType::kCudaBlockX); - context_->iter_space_info.sp_space.emplace_back( - ir::Expr(GetNumThreadPerBlock(context_->bucket_info.rb_upper_bound)), - IterativeSpaceInfo::AxisType::kCudaThreadX); + if (context->bucket_info.space.size() == 2 && + context->bucket_info.space[1].iter_type == "R") { + context_->iter_space_info.sp_space.emplace_back( + ir::Expr( + GetNumThreadPerBlock(context_->bucket_info.space[1].upper_bound)), + IterativeSpaceInfo::AxisType::kCudaThreadX); + } else { + PADDLE_THROW(::common::errors::Unimplemented( + "Currently, TileTactic::Init only supports the SR iteration " + "space.")); + } } VLOG(6) << context_->iter_space_info.PrintIterSpace(); } diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index 1b9c83913112d..6d658ed30cc27 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -26,6 +26,7 @@ #include "paddle/cinn/ir/module.h" #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/optim/ir_simplify.h" +#include "paddle/common/errors.h" namespace cinn { namespace ir { @@ -255,6 +256,7 @@ Expr For::Make(Var loop_var, Expr body, VectorizeInfo vector_info, BindInfo bind_info) { + ir::TryElevateInt32ToInt64({loop_var, min, extent}); auto node = make_shared(); CHECK(loop_var.defined()); CHECK(min.defined()); @@ -884,9 +886,21 @@ void For::Verify() const { CHECK(extent.defined()); CHECK(body.defined()); - CHECK_EQ(loop_var->type(), type_of()); - CHECK_EQ(min->type(), type_of()); - CHECK_EQ(extent->type(), type_of()); + PADDLE_ENFORCE_EQ((loop_var->type() == type_of()) || + (loop_var->type() == type_of()), + true, + ::common::errors::InvalidArgument( + "loop var's type must be int32 or int64")); + PADDLE_ENFORCE_EQ((min->type() == type_of()) || + (min->type() == type_of()), + true, + ::common::errors::InvalidArgument( + "loop min's type must be int32 or int64")); + PADDLE_ENFORCE_EQ((extent->type() == type_of()) || + (extent->type() == type_of()), + true, + ::common::errors::InvalidArgument( + "loop extent's type must be int32 or int64")); } void PolyFor::Verify() const { diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index eeba03a0978ea..84e14cc839c15 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -402,6 +402,11 @@ struct UnaryOpNode : public ExprNode { return v().type(); } + void replace(Expr old_op, Expr new_op) { + if (v() == old_op) { + v() = new_op; + } + } Expr& v() { return operands().front(); } const Expr& v() const { return operands().front(); } diff --git a/paddle/cinn/ir/schedule/impl/base.cc b/paddle/cinn/ir/schedule/impl/base.cc index 24583a67374e7..e68a5396578b0 100644 --- a/paddle/cinn/ir/schedule/impl/base.cc +++ b/paddle/cinn/ir/schedule/impl/base.cc @@ -92,7 +92,7 @@ void DyScheduleImpl::MergeExprs() { } } for (auto& block : merged_block) { - VLOG(3) << "in merged_block, it has " << block; + VLOG(3) << "in merged_block, it has \n" << block; } auto merged_expr = ir::Block::Make(merged_block); exprs[0] diff --git a/paddle/cinn/ir/schedule/schedule_desc.proto b/paddle/cinn/ir/schedule/schedule_desc.proto index 829478cf22dd4..ed6d8bef92dbb 100644 --- a/paddle/cinn/ir/schedule/schedule_desc.proto +++ b/paddle/cinn/ir/schedule/schedule_desc.proto @@ -1,11 +1,11 @@ // Copyright (c) 2022 CINN Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc index 85f8153bb65d4..362e6bff8a113 100644 --- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc +++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc @@ -357,10 +357,10 @@ void EliminateCommonFactorHelper(ir::Expr* expr) { } void EliminateCommonFactorOfLocalIndex(ir::Expr* expr) { - VLOG(2) << "Before EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; + VLOG(4) << "Before EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; EliminateCommonFactorHelper(expr); EliminateCommonFactorHelper(expr); - VLOG(2) << "After EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; + VLOG(4) << "After EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; } } // namespace optim diff --git a/paddle/cinn/optim/resize_buffer.cc b/paddle/cinn/optim/resize_buffer.cc index 2ec4e172b3fc7..0d7ecbbca1b15 100644 --- a/paddle/cinn/optim/resize_buffer.cc +++ b/paddle/cinn/optim/resize_buffer.cc @@ -249,6 +249,7 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> { ir::Store* store = expr->As(); ir::Tensor tensor = store->tensor.as_tensor_ref(); ResizeTensor(&tensor); + ReplaceTensorIndices(store); ir::IRMutator<>::Visit(op, expr); } @@ -277,6 +278,7 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> { for (int i = 0; i < cnt; i++) { load->indices.erase(load->indices.begin()); } + ReplaceTensorIndices(load); ir::IRMutator<>::Visit(op, expr); } @@ -304,6 +306,35 @@ class ResizeBufferFromAnalyzedRange : public ir::IRMutator<> { } } + template + void ReplaceTensorIndices(T* op) { + ir::Tensor tensor = op->tensor.as_tensor_ref(); + ir::Buffer buffer = tensor->buffer; + if (!buffer.defined()) return; + if (buffer->memory_type != ir::MemoryType::GPULocal) return; + + VLOG(4) << "replacing index of tensor: " << tensor->name; + ir::Expr index_expr = op->index(); + std::unordered_map var_name_to_expr; + ir::ir_utils::CollectIRNodes(index_expr, [&](const ir::Expr* x) { + const ir::_Var_* var = x->as_var(); + if (var) { + var_name_to_expr[var->name] = var->Copy(); + } + return false; + }); + if (var_name_to_expr.size() != 1) { + return; + } + + ir::Expr single_var = var_name_to_expr.begin()->second; + VLOG(4) << "found single var: " << single_var; + for (size_t i = 0; i + 1 < op->indices.size(); i++) { + op->indices[i] = ir::Expr(0); + } + op->indices.back() = single_var; + } + private: const std::unordered_map>& buffer_name_to_shape_; diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 4e5d5f4c5ae8e..5d4629436d7e6 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -426,7 +426,7 @@ class ReplaceVarToZero : public ir::IRMutator<> { }; void OptimizeExprGPU(Expr *expr) { - VLOG(2) << "Before Optimize Expr:\n" << *expr; + VLOG(4) << "Before Optimize Expr:\n" << *expr; // copy var nodes to prevent one modification leading to multiple changes RestructureVarNodes restructure_var_nodes; @@ -458,7 +458,7 @@ void OptimizeExprGPU(Expr *expr) { ReplaceVarToZero replace_var_to_zero; replace_var_to_zero(expr); - VLOG(2) << 
"After Optimize Expr: \n" << *expr; + VLOG(4) << "After Optimize Expr: \n" << *expr; } } // namespace optim diff --git a/paddle/cinn/pybind/backends.cc b/paddle/cinn/pybind/backends.cc index 4e589380223df..a0f51bc88aad8 100644 --- a/paddle/cinn/pybind/backends.cc +++ b/paddle/cinn/pybind/backends.cc @@ -61,7 +61,10 @@ void BindExecutionEngine(py::module *m) { &ExecutionEngine::Create)), py::arg("options") = ExecutionOptions()) .def("lookup", lookup) - .def("link", &ExecutionEngine::Link); + .def("link", + &ExecutionEngine::Link, + py::arg("module"), + py::arg("add_module") = true); { auto lookup = [](Compiler &self, absl::string_view name) { diff --git a/paddle/fluid/distributed/fleet_executor/task_loop.cc b/paddle/fluid/distributed/fleet_executor/task_loop.cc index 270bce7786038..44e853a0d9684 100644 --- a/paddle/fluid/distributed/fleet_executor/task_loop.cc +++ b/paddle/fluid/distributed/fleet_executor/task_loop.cc @@ -17,8 +17,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" -namespace paddle { -namespace distributed { +namespace paddle::distributed { thread_local TaskLoop* TaskLoop::thread_local_loop_ = nullptr; @@ -81,5 +80,4 @@ void TaskLoop::AbortNotInLoopThread() { std::this_thread::get_id())); } -} // namespace distributed -} // namespace paddle +} // namespace paddle::distributed diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 27a93a9787ff5..e7e708a2ee4f9 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -1,11 +1,11 @@ // Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc index 31c098c49fba2..fa8fa61a23eab 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc @@ -15,8 +15,7 @@ #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include -namespace paddle { -namespace distributed { +namespace paddle::distributed { GraphNode::~GraphNode() { if (sampler != nullptr) { @@ -122,5 +121,4 @@ void FeatureNode::recover_from_buffer(char* buffer) { feature.push_back(str); // NOLINT } } -} // namespace distributed -} // namespace paddle +} // namespace paddle::distributed diff --git a/paddle/fluid/distributed/rpc/rpc.proto b/paddle/fluid/distributed/rpc/rpc.proto index 2da9e37ae88d9..d9bd22aa974fc 100644 --- a/paddle/fluid/distributed/rpc/rpc.proto +++ b/paddle/fluid/distributed/rpc/rpc.proto @@ -1,11 +1,11 @@ // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 853a0c445797c..247651ae149f5 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -18,6 +18,7 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/feed_hook.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/tensor_ref_array.h" #include "paddle/fluid/framework/variable_helper.h" @@ -583,6 +584,7 @@ inline void PirRunProgramAPI( //} } + paddle::framework::RunFeedHooks(*forward_program, *global_inner_scope); // interpretercore run if (!forward_program->block()->empty()) { paddle::platform::RecordEvent record_event( @@ -869,7 +871,6 @@ inline void RunProgramGradAPI( auto *backward_global_block = PADDLE_GET_CONST( paddle::framework::BlockDesc *, attrs.at("backward_global_block")); auto *backward_program = backward_global_block->Program(); - details::Trans2ContiguousTensorsInplace(out_grad); auto out_grad_names = details::GetTensorsName(out_grad); @@ -1155,6 +1156,7 @@ inline void PirRunProgramGradAPI( } } + paddle::framework::RunFeedHooks(*backward_program, *global_inner_scope); if (!backward_program->block()->empty()) { paddle::platform::RecordEvent record_event( "interpreter_core_run", diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 041339fe597c3..c8f3dc0d673f1 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -515,6 +515,12 @@ cc_library( feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) + +cc_library( + feed_hook + SRCS feed_hook.cc + DEPS lod_tensor scope glog pir) + cc_library( variable_helper SRCS variable_helper.cc @@ -529,6 +535,7 @@ set(NAIVE_EXECUTOR_DEPS glog lod_rank_table feed_fetch_method + feed_hook graph_to_program_pass standalone_executor variable_helper) @@ -598,6 +605,7 @@ if(WITH_DISTRIBUTE) lodtensor_printer lod_rank_table feed_fetch_method + feed_hook collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass @@ -628,7 +636,7 @@ if(WITH_DISTRIBUTE) # pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry # device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog # index_sampler index_wrapper sampler index_dataset_proto - # lod_rank_table framework_io fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method + # lod_rank_table framework_io fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method feed_hook # graph_to_program_pass variable_helper timer monitor # heter_service_proto fleet heter_server brpc fleet_executor # graph_gpu_wrapper) @@ -677,6 +685,7 @@ if(WITH_DISTRIBUTE) metrics lodtensor_printer feed_fetch_method + feed_hook graph_to_program_pass variable_helper timer @@ -750,6 +759,7 @@ if(WITH_DISTRIBUTE) metrics lodtensor_printer feed_fetch_method + feed_hook graph_to_program_pass variable_helper timer @@ 
-808,6 +818,7 @@ elseif(WITH_PSLIB) box_wrapper lodtensor_printer feed_fetch_method + feed_hook graph_to_program_pass variable_helper timer @@ -854,6 +865,7 @@ else() box_wrapper lodtensor_printer feed_fetch_method + feed_hook graph_to_program_pass variable_helper timer diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 4c78b12fd4ac4..5e4edb1ca2870 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -24,9 +24,7 @@ COMMON_DECLARE_bool(sync_nccl_allreduce); #endif -namespace paddle { -namespace framework { -namespace details { +namespace paddle::framework::details { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, @@ -335,6 +333,4 @@ void AllReduceOpHandle::SyncNCCLAllReduce() { #endif std::string AllReduceOpHandle::Name() const { return "all_reduce"; } -} // namespace details -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 4dbff851f00e2..b8db1e321257b 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -21,15 +21,11 @@ #endif #include -namespace paddle { -namespace framework { +namespace paddle::framework { class Variable; -} // namespace framework -} // namespace paddle +} // namespace paddle::framework -namespace paddle { -namespace framework { -namespace details { +namespace paddle::framework::details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, @@ -213,6 +209,4 @@ std::vector EagerDeletionOpHandle::VarsToDelete() const { return var_names; } -} // namespace details -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index fe43126ca8abe..05e1693eb650e 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -26,9 +26,7 @@ PADDLE_DEFINE_EXPORTED_bool( false, "Whether to make the result of computation deterministic in CPU side."); -namespace paddle { -namespace framework { -namespace details { +namespace paddle::framework::details { std::once_flag CollectiveContext::init_flag_; std::unique_ptr CollectiveContext::context_; @@ -318,6 +316,4 @@ std::vector ReduceOpHandle::GetInputValues( } std::string ReduceOpHandle::Name() const { return "reduce"; } -} // namespace details -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/feed_hook.cc b/paddle/fluid/framework/feed_hook.cc new file mode 100644 index 0000000000000..b2322839c6d03 --- /dev/null +++ b/paddle/fluid/framework/feed_hook.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/feed_hook.h" +#include +#include +#include "paddle/common/flags.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/include/core/program.h" + +COMMON_DECLARE_string(logging_pir_py_code_dir); +COMMON_DECLARE_bool(logging_trunc_pir_py_code); + +namespace paddle::framework { + +namespace { + +std::optional GetLoggingFilePath() { + if (FLAGS_logging_pir_py_code_dir.empty()) return std::nullopt; + const std::string file_path = + FLAGS_logging_pir_py_code_dir + "/programs_example_input_tensor_meta.py"; + return file_path; +} + +void TryTruncateLoggingFile() { + if (!FLAGS_logging_trunc_pir_py_code) return; + std::optional file_path = GetLoggingFilePath(); + if (!file_path.has_value()) return; + static std::once_flag once_flag; + std::call_once(once_flag, [&] { + std::ofstream ofs; + ofs.open(file_path.value().c_str(), std::ios::out | std::ios::trunc); + ofs.close(); + }); +} + +template +void VisitFeedName(const pir::Program& program, + const DoEachFeadNameT& DoEachFeadName) { + auto module_op = program.module_op(); + const auto& block = module_op.block(); + const auto& IsDataOp = [](const pir::Operation& op) -> bool { + return op.isa(); + }; + const auto& GetDataOpName = [](const pir::Operation& op) -> std::string { + return op.attributes().at("name").dyn_cast().AsString(); + }; + for (const auto& op : block) { + if (IsDataOp(op)) { + DoEachFeadName(GetDataOpName(op)); + } + } + for (const auto& [name, _] : block.kwargs()) { + DoEachFeadName(name); + } +} + +std::string GetLoggingShapeOrDataForName(int64_t program_id, + const std::string& name, + const phi::DenseTensor& tensor) { + std::ostringstream ss; + ss << "class PirProgram_example_input_tensor_meta_" << program_id << ":"; + ss << "\n\tprogram_id = " << program_id; + ss << "\n\tinput_name = " << std::quoted(name); + ss << "\n\tshape = ["; + int i = 0; + for (int dim : ::common::vectorize(tensor.dims())) { + if (i++ > 0) { + ss << ", "; + } + ss << dim; + } + ss << "]"; + ss << "\n\n"; + return ss.str(); +} + +void AppendToLoggingFile(const std::string& logging_str) { + std::optional file_path = GetLoggingFilePath(); + if (!file_path.has_value()) return; + std::ofstream ofs; + ofs.open(file_path.value().c_str(), std::ios::out | std::ios::app); + if (!ofs.is_open()) return; + ofs << logging_str << std::endl; + ofs.close(); +} + +void AppendLoggingShapeOrDataForName(int64_t uid, + const std::string& name, + const phi::DenseTensor& tensor) { + static std::mutex mutex; + std::unique_lock lock(mutex); + using Name2OnceFlag = std::unordered_map; + static std::unordered_map once_flags; + std::call_once(once_flags[uid][name], [&] { + AppendToLoggingFile(GetLoggingShapeOrDataForName(uid, name, tensor)); + }); +} + +void SaveLoggingShapeOrData(const pir::Program& program, const Scope& scope) { + if (FLAGS_logging_pir_py_code_dir.empty()) return; + TryTruncateLoggingFile(); + VisitFeedName(program, [&](const std::string& name) { + Variable* variable = scope.FindVar(name); + if (variable == nullptr) return; + if 
(!variable->IsType()) return; + const phi::DenseTensor& tensor = variable->Get(); + AppendLoggingShapeOrDataForName(program.id(), name, tensor); + }); +} + +} // namespace + +void RunFeedHooks(const pir::Program& program, const Scope& scope) { + SaveLoggingShapeOrData(program, scope); +} + +} // namespace paddle::framework diff --git a/paddle/fluid/pybind/parallel_executor.h b/paddle/fluid/framework/feed_hook.h similarity index 70% rename from paddle/fluid/pybind/parallel_executor.h rename to paddle/fluid/framework/feed_hook.h index 3c3acace033a7..3a8584e3899b6 100644 --- a/paddle/fluid/pybind/parallel_executor.h +++ b/paddle/fluid/framework/feed_hook.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,12 +14,16 @@ #pragma once -#include "pybind11/pybind11.h" +namespace pir { -namespace paddle { -namespace pybind { +class Program; -void BindParallelExecutor(pybind11::module& m); // NOLINT +} -} // namespace pybind -} // namespace paddle +namespace paddle::framework { + +class Scope; + +void RunFeedHooks(const pir::Program& program, const Scope& scope); + +} // namespace paddle::framework diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b3ff3ac35d96d..a5f1d3bea2e7d 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -334,6 +334,8 @@ if(WITH_XPU) DEPS ${XPU_PASS_DEPS}) pass_library(weight_only_linear_xpu_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(block_multihead_attention_xpu_pass inference DIR xpu DEPS + ${XPU_PASS_DEPS}) endif() cc_library( diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc index 17f0c642a60d1..c5480db1ca466 100644 --- a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc @@ -16,9 +16,7 @@ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { void AddVarToScope(Scope* param_scope, const std::string& name, @@ -315,8 +313,6 @@ TEST(ApplyCastPass, basic) { cast_num_in_graph)); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir USE_PASS(delete_cast_op_pass); diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc index 2e5c2b5be4ac3..defc320495064 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc @@ -17,10 +17,7 @@ limitations under the License. 
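For reference, the record layout written by the new feed_hook.cc above is easy to reproduce in isolation. Below is a minimal, self-contained sketch of the string-building logic in GetLoggingShapeOrDataForName; MakeExampleRecord and its sample inputs are invented for illustration and are not part of this patch:

#include <cstdint>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mirrors the string layout emitted by GetLoggingShapeOrDataForName in the
// new paddle/fluid/framework/feed_hook.cc; the function name and the inputs
// below are examples, not values taken from a real program.
std::string MakeExampleRecord(int64_t program_id,
                              const std::string& name,
                              const std::vector<int>& dims) {
  std::ostringstream ss;
  ss << "class PirProgram_example_input_tensor_meta_" << program_id << ":";
  ss << "\n\tprogram_id = " << program_id;
  ss << "\n\tinput_name = " << std::quoted(name);
  ss << "\n\tshape = [";
  int i = 0;
  for (int dim : dims) {
    if (i++ > 0) {
      ss << ", ";
    }
    ss << dim;
  }
  ss << "]";
  ss << "\n\n";
  return ss.str();
}

int main() {
  // Prints the Python class literal that the hook would append to
  // programs_example_input_tensor_meta.py for a feed named "x".
  std::cout << MakeExampleRecord(1, "x", {8, 3, 224, 224});
  return 0;
}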
*/ #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h" #include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h" -namespace paddle { -namespace framework { -namespace ir { -namespace fusion_group { +namespace paddle::framework::ir::fusion_group { std::string ExtractDataType(const std::vector& nodes) { std::string dtype_str = ""; @@ -373,7 +370,4 @@ std::unordered_map CodeGenerator::EncodeVarNodes( return var_ids; } -} // namespace fusion_group -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir::fusion_group diff --git a/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc index 2e6aaa37808ae..1fbe22ff33021 100644 --- a/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/merge_layernorm_fuse_pass.cc @@ -39,9 +39,7 @@ GET_IR_NODE(layernorm_40_in_bias); \ GET_IR_NODE(layernorm_40_in_scale); \ GET_IR_NODE(layernorm_40_out); -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { MergeLayernormFusePass::MergeLayernormFusePass() { AddOpCompat(OpCompat("reshape2")) .AddInput("X") @@ -176,9 +174,7 @@ void MergeLayernormFusePass::ApplyImpl(ir::Graph* graph) const { gpd(graph, handler); AddStatis(fusion_count); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(merge_layernorm_fuse_pass, paddle::framework::ir::MergeLayernormFusePass); REGISTER_PASS_CAPABILITY(merge_layernorm_fuse_pass) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc index b907869b4a38e..e0b96b69116a4 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc @@ -16,9 +16,7 @@ #include "paddle/fluid/framework/ir/pass.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { class Graph; @@ -106,9 +104,7 @@ void AddReaderDependencyPass::ApplyImpl(Graph *graph) const { } } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(add_reader_dependency_pass, paddle::framework::ir::AddReaderDependencyPass); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc index f4f0e393c2499..72e8baaba5017 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc @@ -17,9 +17,7 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { static std::unordered_set ReaderOpSet() { return {"create_py_reader"}; @@ -78,6 +76,4 @@ void SetReaderOpDeviceInfo(Graph *graph, size_t dev_cnt, size_t dev_idx) { VLOG(10) << "Found op number " << found_op_num; } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir diff --git a/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc 
b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc index 1c733636ca7b0..cf17f00fa4080 100644 --- a/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc +++ b/paddle/fluid/framework/ir/onednn/compute_propagate_scales_onednn_pass.cc @@ -21,9 +21,7 @@ #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_version_registry.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { void ComputePropagateScalesMkldnnPass::GetTensorFromVector( const std::vector& data_v, phi::DenseTensor* tensor) const { @@ -516,9 +514,7 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { graph, "has_quant_info", "var_quant_scales", var_quant_scales); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(compute_propagate_scales_onednn_pass, paddle::framework::ir::ComputePropagateScalesMkldnnPass); diff --git a/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc index 7733730f7d605..14857f3c550d8 100644 --- a/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/conv_elementwise_add_onednn_fuse_pass.cc @@ -19,9 +19,7 @@ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/utils/string/pretty_log.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { AddOpCompat(OpCompat("conv2d")) @@ -305,9 +303,7 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(graph_with_stats.second); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(conv_elementwise_add_onednn_fuse_pass, paddle::framework::ir::ResidualConnectionMKLDNNFusePass); diff --git a/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc index a21ddd579be3c..f937a1c681b17 100644 --- a/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/operator_reshape2_onednn_fuse_pass.cc @@ -19,9 +19,7 @@ #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/utils/string/pretty_log.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { using string::PrettyLogDetail; @@ -132,9 +130,7 @@ void FuseOperatorReshape2OneDNNPass::FuseReshape2(Graph *graph, op_type); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(operator_reshape2_onednn_fuse_pass, paddle::framework::ir::FuseOperatorReshape2OneDNNPass); diff --git a/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc index 4af9c6a770436..7ac8edbb6005c 100644 --- a/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/onednn/squeeze2_transpose2_onednn_fuse_pass.cc @@ -17,9 +17,7 @@ #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/utils/string/pretty_log.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { using string::PrettyLogDetail; @@ -77,9 +75,7 @@ void 
FuseSqueeze2Transpose2OneDNNPass::ApplyImpl(Graph *graph) const { } } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(squeeze2_transpose2_onednn_fuse_pass, paddle::framework::ir::FuseSqueeze2Transpose2OneDNNPass); diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index ccf2bf22ab57b..718e15b01fd72 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -18,9 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { void PlacementPassBase::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Applies " << GetPlacementName() << " placement strategy."; @@ -43,6 +41,4 @@ void PlacementPassBase::ApplyImpl(ir::Graph* graph) const { } } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir diff --git a/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc b/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc index 7cbb5c169f63c..3917423754ba4 100644 --- a/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc +++ b/paddle/fluid/framework/ir/preln_elementwise_groupnorm_act_pass.cc @@ -18,18 +18,11 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { class Node; -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir -namespace paddle { -namespace framework { -namespace ir { -namespace patterns { +namespace paddle::framework::ir::patterns { struct PrelnGroupNormAct : public PatternBase { PrelnGroupNormAct(PDPattern *pattern, const std::string &name_scope) @@ -92,7 +85,8 @@ void PrelnGroupNormAct::operator()(PDNode *x, PDNode *y, bool with_act) { } } -} // namespace patterns +} // namespace paddle::framework::ir::patterns +namespace paddle::framework::ir { int PrelnGroupNormActFusePass::ApplyAddGNPattern(ir::Graph *graph, bool with_act) const { @@ -203,9 +197,7 @@ void PrelnGroupNormActFusePass::ApplyImpl(ir::Graph *graph) const { AddStatis(found_subgraph_count); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(preln_elementwise_groupnorm_act_pass, paddle::framework::ir::PrelnGroupNormActFusePass); diff --git a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc index d68694106b5c7..c6a22c143fb66 100644 --- a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc @@ -22,10 +22,7 @@ #endif #include "paddle/phi/kernels/funcs/blas/blas.h" -namespace paddle { -namespace framework { -namespace ir { -namespace patterns { +namespace paddle::framework::ir::patterns { // input_qk input_v // |q |k v @@ -249,7 +246,8 @@ PDNode* TrtQKMultiHeadMatmulPattern::operator()() { return reshape2_qkv_out_var; } -} // namespace patterns +} // namespace paddle::framework::ir::patterns +namespace paddle::framework::ir { int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph, const std::string& name_scope, @@ -575,9 +573,7 @@ void 
TrtQkMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { AddStatis(fusion_count); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(trt_qk_multihead_matmul_fuse_pass, paddle::framework::ir::TrtQkMultiHeadMatmulFusePass); diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc index 0708218dbd07c..e90cadc782a61 100644 --- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc @@ -22,18 +22,11 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { class Node; -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir -namespace paddle { -namespace framework { -namespace ir { -namespace patterns { +namespace paddle::framework::ir::patterns { struct TrtSkipLayerNorm : public PatternBase { TrtSkipLayerNorm(PDPattern *pattern, const std::string &name_scope) @@ -102,7 +95,8 @@ PDNode *TrtSkipLayerNorm::operator()(PDNode *x, PDNode *y) { return layer_norm_out_var; } -} // namespace patterns +} // namespace paddle::framework::ir::patterns +namespace paddle::framework::ir { void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL( @@ -271,9 +265,7 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { AddStatis(found_subgraph_count); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(trt_skip_layernorm_fuse_pass, paddle::framework::ir::TrtSkipLayerNormFusePass); diff --git a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc index d9907555a17b5..6b49a99c02364 100644 --- a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc +++ b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc @@ -26,9 +26,7 @@ #include "paddle/fluid/framework/ir/node.h" #include "paddle/phi/common/data_type.h" -namespace paddle { -namespace framework { -namespace ir { +namespace paddle::framework::ir { namespace { @@ -383,8 +381,6 @@ void TrtSupportNHWCPass::ApplyImpl(Graph *graph) const { AddStatis(transposed_ops.size()); } -} // namespace ir -} // namespace framework -} // namespace paddle +} // namespace paddle::framework::ir REGISTER_PASS(trt_support_nhwc_pass, paddle::framework::ir::TrtSupportNHWCPass); diff --git a/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc b/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc new file mode 100644 index 0000000000000..3d4c78896f7e2 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
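A recurring, mechanical change throughout this patch is visible in the surrounding files: pre-C++17 nested namespace blocks are collapsed into a single C++17 nested namespace definition with one matching closing comment. A standalone sketch of the two equivalent forms (the demo:: names are invented, not Paddle's; compile with -std=c++17):

// Pre-C++17 form: one block and one closing comment per namespace level.
namespace demo { namespace framework { namespace ir {
inline int answer() { return 42; }
}  // namespace ir
}  // namespace framework
}  // namespace demo

// C++17 nested namespace definition: same namespace, one block, one comment.
namespace demo::framework::ir {
inline int answer_again() { return answer(); }
}  // namespace demo::framework::ir

int main() { return demo::framework::ir::answer_again() == 42 ? 0 : 1; }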
+ +#include + +#include "glog/logging.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/xpu/pass_utils.h" +#include "paddle/fluid/framework/ir/xpu/quant_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +class BlockMultiHeadAttentionXPUPass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + void InplaceBlockMultiHeadAttentionXPU(ir::Graph* graph) const; + + const std::string name_scope_{"block_multihead_attention_xpu_pass"}; +}; + +void BlockMultiHeadAttentionXPUPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + + InplaceBlockMultiHeadAttentionXPU(graph); +} + +void BlockMultiHeadAttentionXPUPass::InplaceBlockMultiHeadAttentionXPU( + ir::Graph* graph) const { + const int64_t max_batch_size = 10; + auto* scope = param_scope(); + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "block_multihead_attention") { + auto* op_desc = node->Op(); + op_desc->SetType("block_multihead_attention_xpu"); + phi::DenseTensor cache_k_per_batch_maxs; + auto base_name = op_desc->Input("qkv")[0]; + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + std::string cache_k_per_batch_maxs_name = base_name + "_max_cache_k"; + VarDesc cache_k_per_batch_maxs_desc(cache_k_per_batch_maxs_name); + cache_k_per_batch_maxs_desc.SetPersistable(true); + cache_k_per_batch_maxs_desc.SetShape( + {max_batch_size, static_cast(max_ptr_size)}); + cache_k_per_batch_maxs_desc.SetDataType( + proto::VarType::Type::VarType_Type_FP32); + Node* cache_k_per_batch_maxs_in = + graph->CreateVarNode(&cache_k_per_batch_maxs_desc); + phi::DenseTensor cpu_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + cpu_tensor.set_type(phi::DataType::FLOAT32); + cpu_tensor.Resize({max_batch_size, max_ptr_size}); + std::vector tmp(max_batch_size * max_ptr_size, 0); + memcpy(cpu_ctx->Alloc(&cpu_tensor), + tmp.data(), + max_batch_size * max_ptr_size * sizeof(float)); + Assign(cpu_tensor, + scope->Var(cache_k_per_batch_maxs_name) + ->GetMutable()); + op_desc->SetInput("cache_k_per_batch_maxs", + {cache_k_per_batch_maxs_name}); + + std::string cache_v_per_batch_maxs_name = base_name + "_max_cache_v"; + VarDesc cache_v_per_batch_maxs_desc(cache_v_per_batch_maxs_name); + cache_v_per_batch_maxs_desc.SetPersistable(true); + cache_v_per_batch_maxs_desc.SetShape( + {max_batch_size, static_cast(max_ptr_size)}); + cache_v_per_batch_maxs_desc.SetDataType( + proto::VarType::Type::VarType_Type_FP32); + Node* cache_v_per_batch_maxs_in = + graph->CreateVarNode(&cache_v_per_batch_maxs_desc); + Assign(cpu_tensor, + scope->Var(cache_v_per_batch_maxs_name) + ->GetMutable()); + op_desc->SetInput("cache_v_per_batch_maxs", + {cache_v_per_batch_maxs_name}); + + IR_NODE_LINK_TO(cache_k_per_batch_maxs_in, node); + IR_NODE_LINK_TO(cache_v_per_batch_maxs_in, node); + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + 
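// In outline, the pass above rewrites each block_multihead_attention op to
// block_multihead_attention_xpu and attaches two new persistable inputs,
// "<qkv_name>_max_cache_k" and "<qkv_name>_max_cache_v": zero-initialized
// FP32 tensors of shape [max_batch_size, max_ptr_size] which, judging from
// their names, hold per-batch max values for the K/V cache on XPU.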
+REGISTER_PASS(block_multihead_attention_xpu_pass, + paddle::framework::ir::BlockMultiHeadAttentionXPUPass); + +REGISTER_PASS_CAPABILITY(block_multihead_attention_xpu_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "block_multihead_attention_xpu", 0)); diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index 609fd78106747..e7a05d75f6e99 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -42,8 +42,7 @@ COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif -namespace paddle { -namespace framework { +namespace paddle::framework { std::vector GetValueIds(pir::Value value, const ValueExecutionInfo& value_exec_info) { @@ -407,5 +406,4 @@ bool GetCondData(const phi::DenseTensor& cond) { return cpu_cond->data()[0]; } -} // namespace framework -} // namespace paddle +} // namespace paddle::framework diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc index 9af41b9e8c08b..b8a56321b9e66 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_instruction.cc @@ -405,9 +405,12 @@ OneDNNPhiKernelInstruction::~OneDNNPhiKernelInstruction() { } void OneDNNPhiKernelInstruction::Run() { + std::vector> tmp_holders; + auto tmp_kernel_context = kernel_context_; + auto tmp_infer_meta_context_ = infer_meta_context_; // Step1. TransLayout - auto inputs = kernel_context_.InputsBetween( - size_t(0), kernel_context_.InputsSize()); + auto inputs = tmp_kernel_context.InputsBetween( + size_t(0), tmp_kernel_context.InputsSize()); for (size_t i = 0; i < inputs.size(); ++i) { auto input = inputs[i]; if (input == nullptr) { @@ -419,10 +422,12 @@ void OneDNNPhiKernelInstruction::Run() { if (skip_format_tensors_.count(i)) { continue; } - VLOG(6) << "input[" << i << "].layout() = " << input->layout(); + VLOG(6) << "input[" << i << "].layout() = " << input->layout() + << ", shape = " << input->dims(); if (input->layout() != phi::DataLayout::ONEDNN) { phi::DataLayout from_layout = input->layout(); - auto transed_tensor = const_cast(input); + tmp_holders.emplace_back(std::make_shared(*input)); + auto transed_tensor = tmp_holders.back().get(); std::set elementwise_kernels = { "add", "subtract", "multiply", "divide"}; @@ -461,8 +466,24 @@ void OneDNNPhiKernelInstruction::Run() { } dnnl::memory::desc out_mem_desc = - phi::funcs::make_memory_desc(*input, from_layout); + phi::funcs::make_memory_desc(*transed_tensor, from_layout); transed_tensor->set_mem_desc(out_mem_desc); + tmp_kernel_context.UpdataInput(i, transed_tensor); + auto meta_tensor = phi::MetaTensor(transed_tensor); + auto input_meta_tensor = phi::MetaTensor(input); + if (tmp_infer_meta_context_.InputsSize() > i && + tmp_infer_meta_context_.InputAt(i).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(i, meta_tensor); + } else { + for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) { + if (tmp_infer_meta_context_.InputAt(j).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(j, meta_tensor); + break; + } + } + } } } @@ -470,7 +491,7 @@ void OneDNNPhiKernelInstruction::Run() { // SetDnnAttrIntoDeviceContext // SetInputsName SetOutputsName 
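// Note: the code below operates on the per-call copies created at the top of
// Run() (tmp_kernel_context / tmp_infer_meta_context_); transposed inputs are
// cloned into tmp_holders and re-attached via UpdataInput, so the cached
// contexts and their original input tensors are no longer mutated in place.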
auto one_dnn_ctx = const_cast( - &kernel_context_.GetDeviceContext()); + &tmp_kernel_context.GetDeviceContext()); for (auto& attr : extra_attr_) { one_dnn_ctx->SetDnnAttr(attr.first, attr.second); } @@ -482,12 +503,12 @@ void OneDNNPhiKernelInstruction::Run() { // Step3. InferMeta if (infer_meta_interface_) { - infer_meta_interface_->infer_meta_(&(infer_meta_context_)); + infer_meta_interface_->infer_meta_(&(tmp_infer_meta_context_)); } // Step4. Run kernel VLOG(6) << "Run op " << phi_op_name_ << " infer meta."; - (*(phi_kernel_))(&(kernel_context_)); + (*(phi_kernel_))(&(tmp_kernel_context)); VLOG(6) << "Run op " << phi_op_name_ << " kernel."; // Step5. ClearDnnAttr diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc index 0115f2f4b9f31..3f72973e37a3e 100644 --- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_mixed_instruction.cc @@ -58,6 +58,7 @@ OneDNNMixedPhiKernelInstruction::OneDNNMixedPhiKernelInstruction( } void OneDNNMixedPhiKernelInstruction::Run() { + std::vector> tmp_holders; // Step1. Mixed Dynamic Choose Kernel if (!has_choose_kernel_) { has_choose_kernel_ = true; @@ -76,9 +77,11 @@ void OneDNNMixedPhiKernelInstruction::Run() { if (use_onednn_kernel_) { OneDNNPhiKernelInstruction::Run(); } else { + auto tmp_kernel_context = kernel_context_; + auto tmp_infer_meta_context_ = infer_meta_context_; // TransLayout first - auto inputs = kernel_context_.InputsBetween( - size_t(0), kernel_context_.InputsSize()); + auto inputs = tmp_kernel_context.InputsBetween( + size_t(0), tmp_kernel_context.InputsSize()); for (size_t i = 0; i < inputs.size(); ++i) { auto input = inputs[i]; @@ -89,30 +92,66 @@ void OneDNNMixedPhiKernelInstruction::Run() { // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in // data_transfer.cc if (!input->IsInitialized() && tmp_layout == DataLayout::NHWC) { - auto transed_tensor = const_cast(input); + tmp_holders.emplace_back(std::make_shared(*input)); + auto transed_tensor = tmp_holders.back().get(); transed_tensor->set_layout(tmp_layout); phi::funcs::MatchShapeToLayout( transed_tensor, phi::DataLayout::ONEDNN, tmp_layout); + dnnl::memory::desc out_mem_desc = + phi::funcs::make_memory_desc(*transed_tensor, tmp_layout); + transed_tensor->set_mem_desc(out_mem_desc); + tmp_kernel_context.UpdataInput(i, transed_tensor); + auto meta_tensor = phi::MetaTensor(transed_tensor); + auto input_meta_tensor = phi::MetaTensor(input); + if (tmp_infer_meta_context_.InputsSize() > i && + tmp_infer_meta_context_.InputAt(i).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(i, meta_tensor); + } else { + for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) { + if (tmp_infer_meta_context_.InputAt(j).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(j, meta_tensor); + break; + } + } + } } else { - phi::DenseTensor transed_tensor; - transed_tensor.set_meta(input->meta()); + tmp_holders.emplace_back(std::make_shared()); + auto transed_tensor = tmp_holders.back().get(); + transed_tensor->set_meta(input->meta()); phi::funcs::TransDataLayoutFromOneDNN(phi::DataLayout::ONEDNN, tmp_layout, *input, - &transed_tensor, + transed_tensor, phi::CPUPlace()); - *(const_cast(input)) = transed_tensor; + tmp_kernel_context.UpdataInput(i, transed_tensor); + auto meta_tensor = 
phi::MetaTensor(transed_tensor); + auto input_meta_tensor = phi::MetaTensor(input); + if (tmp_infer_meta_context_.InputsSize() > i && + tmp_infer_meta_context_.InputAt(i).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(i, meta_tensor); + } else { + for (size_t j = 0; j < tmp_infer_meta_context_.InputsSize(); ++j) { + if (tmp_infer_meta_context_.InputAt(j).is_same_tensor( + input_meta_tensor)) { + tmp_infer_meta_context_.UpdataInput(j, meta_tensor); + break; + } + } + } } } } VLOG(6) << "Begin run op " << phi_op_name_ << " infer meta."; if (infer_meta_interface_) { - infer_meta_interface_->infer_meta_(&(infer_meta_context_)); + infer_meta_interface_->infer_meta_(&(tmp_infer_meta_context_)); } VLOG(6) << "End run op " << phi_op_name_ << " infer meta."; VLOG(6) << "Begin run op " << phi_op_name_ << " kernel."; - (*(phi_kernel_))(&(kernel_context_)); + (*(phi_kernel_))(&(tmp_kernel_context)); VLOG(6) << "End run op " << phi_op_name_ << " kernel."; } } diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 54ee746726e7e..96f21e1a534c0 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -431,12 +431,24 @@ void analyse_event_info_for_two_instructions( if (has_data_dependency( instructions[cur_instr_id], instructions[next_instr_id]) || - !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() || instructions[next_instr_id]->OpBase()->Type() == "depend") { waiter_instr_ids->insert(next_instr_id); return; } + if (!run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty()) { + auto& next_next_instructor_ids = + run_type_info[next_instr_id][DownstreamRunType::kEventRun]; + for (auto& id : next_next_instructor_ids) { + if (has_data_dependency( + instructions[cur_instr_id], instructions[id])) { + waiter_instr_ids->insert(next_instr_id); + return; + } + } + return; + } + // NOTE(Ruibiao): If no data dependency from cur_instr to next_instr, and // simultaneously next_instr has no event_run downstream instr, we try to // recursively add events between cur_instr and next_instr's @@ -491,12 +503,25 @@ void analyse_event_info_for_two_instructions< if (has_data_dependency( instructions[cur_instr_id], instructions[next_instr_id]) || - !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() || instructions[next_instr_id]->Name() == "pd_op.depend") { waiter_instr_ids->insert(next_instr_id); return; } + if (!run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty()) { + auto& next_next_instructor_ids = + run_type_info[next_instr_id][DownstreamRunType::kEventRun]; + for (auto& id : next_next_instructor_ids) { + if (has_data_dependency( + instructions[cur_instr_id], instructions[id])) { + waiter_instr_ids->insert(next_instr_id); + return; + } + } + + return; + } + // NOTE(Ruibiao): If no data dependency from cur_instr to next_instr, and // simultaneously next_instr has no event_run downstream instr, we try to // recursively add events between cur_instr and next_instr's diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 416d46c01e1f2..d5fe408d53401 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -35,8 +35,7 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, "Use local_scope 
in new executor(especially used " "in UT), can turn off for better performance"); -namespace paddle { -namespace framework { +namespace paddle::framework { InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, @@ -170,5 +169,4 @@ Variable* InterpreterCore::DebugVar(const std::string& name) const { return impl_->DebugVar(name); } -} // namespace framework -} // namespace paddle +} // namespace paddle::framework diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 4e4b41579f4fe..3374d38ccaae6 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -13,14 +13,14 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/common/flags.h" +#include "paddle/fluid/framework/feed_hook.h" #include "paddle/fluid/framework/new_executor/feed_fetch_utils.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/framework/new_executor/pir_interpreter.h" #include "paddle/fluid/framework/new_executor/program_interpreter.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/pir/transforms/general/inplace_pass.h" @@ -66,6 +66,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, std::shared_ptr<::pir::Program> ir_program = nullptr; if (FLAGS_enable_pir_api || FLAGS_enable_pir_in_executor) { // NOLINT ir_program = plan_.IrProgram(job_type); + RunFeedHooks(*ir_program, *scope); } else { // NOTE (liuchenghao): std::make_shared will duplicate ProgramDesc object, // maybe std::make_unique is better? diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 9f4f46c60cea4..2a39e664276ed 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -39,8 +39,7 @@ COMMON_DECLARE_bool(check_nan_inf); PD_DECLARE_bool(benchmark); COMMON_DECLARE_bool(run_kp_kernel); -namespace paddle { -namespace imperative { +namespace paddle::imperative { static const phi::Kernel empty_kernel; static const framework::RuntimeContext empty_ctx({}, {}); @@ -752,5 +751,4 @@ void PreparedOp::Run(const NameVarMap& ins, } } -} // namespace imperative -} // namespace paddle +} // namespace paddle::imperative diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc index aaf9439d2b9ed..e8b8c27a24e58 100644 --- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc +++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc @@ -20,9 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/scope.h" -namespace paddle { -namespace inference { -namespace analysis { +namespace paddle::inference::analysis { void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { std::string model_opt_cache_dir = argument->optimized_model_save_path(); @@ -137,6 +135,4 @@ std::string SaveOptimizedModelPass::repr() const { return "save_optimized_model_pass"; } -} // namespace analysis -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index adb7021633b8e..7a211edc2a699 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -28,6 +28,7 @@ #include "paddle/fluid//platform/device/gpu/gpu_types.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/feed_hook.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" @@ -1444,7 +1445,9 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to set feed"; return false; } - + if (config_.new_ir_enabled()) { + ::paddle::framework::RunFeedHooks(*pir_program_, *scope); + } #ifdef PADDLE_WITH_TENSORRT if (config_.tensorrt_engine_enabled()) { inference::tensorrt::TensorRTEngine::predictor_id_per_thread = @@ -1519,7 +1522,9 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to set feed"; return false; } - + if (config_.new_ir_enabled()) { + ::paddle::framework::RunFeedHooks(*pir_program_, *scope); + } #ifdef PADDLE_WITH_TENSORRT if (config_.tensorrt_engine_enabled()) { inference::tensorrt::TensorRTEngine::predictor_id_per_thread = diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat index 6eb932a190654..4bb859becf70c 100644 --- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat +++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat @@ -65,12 +65,12 @@ if /i "%use_gpu%"=="Y" ( set use_gpu=N ) -rem set_path_vs_command_prompt +rem set_path_vs_command_prompt :set_vcvarsall_dir SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat =======>" set tmp_var=!vcvarsall_dir! call:remove_space -set vcvarsall_dir=!tmp_var! +set vcvarsall_dir=!tmp_var! IF NOT EXIST "%vcvarsall_dir%" ( echo "------------%vcvarsall_dir% not exist------------" goto set_vcvarsall_dir @@ -104,18 +104,18 @@ if EXIST "%source_path%\%model_name%.tar.gz" ( SET /P python_path="Please input the path of python.exe, such as C:\Python37\python.exe =======>" set tmp_var=!python_path! call:remove_space - set python_path=!tmp_var! + set python_path=!tmp_var! if "!python_path!"=="" ( set python_path=python.exe ) else ( if NOT exist "!python_path!" ( - echo "------------!python_path! not exist------------" + echo "------------!python_path! not exist------------" goto:eof - ) + ) ) md %source_path%\%model_name% !python_path! 
%source_path%\untar_model.py %source_path%\%model_name%.tar.gz %source_path%\%model_name% - + SET error_code=N if "%model_name%"=="mobilenet" ( if NOT EXIST "%source_path%\%model_name%\model" set error_code=Y @@ -127,7 +127,7 @@ if EXIST "%source_path%\%model_name%.tar.gz" ( del /f /s /q "%source_path%\%model_name%\*.*" >nul 2>&1 rd /s /q "%source_path%\%model_name%" >nul 2>&1 goto:eof - ) + ) ) ) @@ -201,7 +201,7 @@ if /i "%use_gpu%"=="Y" ( ) if exist "%build_path%\Release\%demo_name%.exe" ( - cd %build_path%\Release + cd %build_path%\Release set GLOG_v=4 if "%demo_name%"=="simple_on_word2vec" ( %demo_name%.exe --dirname="%source_path%\%model_name%\%model_name%" --use_gpu="%use_gpu%" diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 57f8066df1eeb..d8206093efa53 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -60,11 +60,6 @@ void Tensor::Reshape(const std::vector &shape) { "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); tensor->Resize(common::make_ddim(shape)); -#ifdef PADDLE_WITH_DNNL - if (tensor->layout() == phi::DataLayout::ONEDNN) { - tensor->set_layout(phi::DataLayout::ANY); - } -#endif } void Tensor::ReshapeStrings(const size_t &shape) { @@ -212,11 +207,6 @@ void Tensor::CopyFromCpu(const T *data) { if (place_ == PlaceType::kCPU) { auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); -#ifdef PADDLE_WITH_DNNL - if (tensor->layout() == phi::DataLayout::ONEDNN) { - tensor->set_layout(phi::DataLayout::ANY); - } -#endif } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index a296074f9d6cf..45c2d5607afde 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -538,6 +538,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "group_norm_silu_xpu_fuse_pass", "embedding_with_eltwise_add_xpu_fuse_pass", "qk_qkv_attention_xpu_fuse_pass", + "block_multihead_attention_xpu_pass", "multi_encoder_xpu_fuse_pass", "multi_encoder_xpu_adaptive_seqlen_fuse_pass", "multi_encoder_xpu_slice_fuse_pass", @@ -613,11 +614,13 @@ const std::vector kPirGpuPasses{ "fused_weight_only_linear_pass", "matmul_add_act_fuse_pass", "fc_elementwise_layernorm_fuse_pass", + "add_norm_fuse_pass", "matmul_scale_fuse_pass", "matmul_transpose_fuse_pass", "transpose_flatten_concat_fuse_pass", "remove_redundant_transpose_pass", - "transfer_layout_pass"}; + "transfer_layout_pass", +}; const std::vector kPirXpuPasses{// Functional pass "map_op_to_another_pass", diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map index 267dcf7fb601d..180d4e643ba23 100644 --- a/paddle/fluid/inference/paddle_inference.map +++ b/paddle/fluid/inference/paddle_inference.map @@ -71,7 +71,7 @@ /* *paddle::framework*; */ *paddle::framework::InitDevices*; *paddle::framework::InitMemoryMethod*; - + *paddle::framework::InterpreterCore*; *paddle::framework::Executor*; *paddle::framework::proto*; diff --git a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc index 37a53d31f47b5..547ec74c19fa6 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc 
+++ b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { template void ConvertConv3d(TensorRTEngine* engine, @@ -192,9 +190,7 @@ class Deconv3dOpConverter : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(conv3d, Conv3dOpConverter); REGISTER_TRT_OP_CONVERTER(conv3d_transpose, Deconv3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index d3fda4cb24e28..f505c36b2ed5c 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { // LeakyRelu converter from fluid to tensorRT class LeakyReluOpConverter : public OpConverter { @@ -121,8 +119,6 @@ class LeakyReluOpConverter : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc index 16d6f3f20750c..fd72f8b78f9af 100644 --- a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc @@ -12,9 +12,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/phi/common/data_type.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { /* * After trt_map_ops_to_matrix_multiply_pass(mul, matmul, matmul_v2 -> @@ -266,8 +264,6 @@ class MatrixMultiplyOpConverter : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(matrix_multiply, MatrixMultiplyOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc index 107217477d14f..f2d00ab4b4667 100644 --- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc @@ -13,9 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { class MultiClassNMS3OpConverter : public OpConverter { public: @@ -170,8 +168,6 @@ class MultiClassNMS3OpConverter : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(multiclass_nms3, MultiClassNMS3OpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc index 1dca9bb818c38..f7fda67a3643f 100644 --- a/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/transformer_input_convert_op.cc @@ -15,9 +15,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.h" -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { /* * Convert Transformer Input(pos_id, max_seqlen). @@ -58,8 +56,6 @@ class TransformerInputConvert : public OpConverter { } }; -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt REGISTER_TRT_OP_CONVERTER(transformer_input_convert, TransformerInputConvert); diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index d87c9af8cfa67..ae12901e7da90 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -29,9 +29,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/float16.h" using float16 = phi::dtype::float16; -namespace paddle { -namespace inference { -namespace tensorrt { +namespace paddle::inference::tensorrt { class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { public: @@ -1049,6 +1047,4 @@ TEST_F(TensorRTDynamicShapeGNTest, test_trt_dynamic_shape_groupnorm) { } */ #endif -} // namespace tensorrt -} // namespace inference -} // namespace paddle +} // namespace paddle::inference::tensorrt diff --git a/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 b/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 index e7b7812fe61be..71c38e487c909 100644 --- a/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 +++ b/paddle/fluid/ir_adaptor/translator/op_compat_info.cc.j2 @@ -2,7 +2,7 @@ namespace paddle { namespace translator { - + OpNameNormalizer::OpNameNormalizer() { op_name_mappings = { {% for legacy_name, normalized_name in op_name_pairs.items() %} @@ -11,35 +11,35 @@ OpNameNormalizer::OpNameNormalizer() { }; op_arg_name_mappings = { {% for op_name, arg_name_mappings in op_arg_name_pairs.items() %} - { - "{{op_name}}", + { + "{{op_name}}", { {% for normalized_name, legacy_name in arg_name_mappings.items() %} { "{{normalized_name}}", "{{legacy_name}}" }, {% endfor %} - }, + }, }, {% endfor %} }; op_mutable_attributes = { {% for op_name, mutable_attributes in op_mutable_attributes.items() %} - { - "{{op_name}}", + { + "{{op_name}}", { {% for attribute_name in mutable_attributes %} "{{attribute_name}}", {% endfor %} - }, + }, }, {% endfor %} }; op_mutable_attribute_infos = { {% for op_name, mutable_attribute_infos in op_mutable_attribute_infos.items() %} - { - "{{op_name}}", + { + "{{op_name}}", { {% for attribute_name, attribute_info in mutable_attribute_infos.items() %} - { + { "{{attribute_name}}", { {% for candidate_var_name in attribute_info %} @@ -48,7 +48,7 @@ OpNameNormalizer::OpNameNormalizer() { }, }, {% endfor %} - }, + }, }, {% endfor %} }; diff --git a/paddle/fluid/jit/property.proto b/paddle/fluid/jit/property.proto index 5f89e1da90b91..a00da9fc6e40a 100644 --- a/paddle/fluid/jit/property.proto +++ b/paddle/fluid/jit/property.proto @@ -84,7 +84,7 @@ message TensorProto { // For int64. 
// When this field is present, the data_type field MUST be INT64 repeated int64 int64_data = 7 [packed = true]; - + // For double // Complex128 tensors are encoded as a single array of doubles, // with the real components appearing in odd numbered positions, @@ -130,16 +130,16 @@ message ValueProto { STRINGS = 8; TENSORS = 9; } - optional string name = 1; - + optional string name = 1; + optional AttributeType type = 2; // discriminator that indicates which field below is in use - + // Exactly ONE of the following fields must be present optional float f = 3; // float optional int64 i = 4; // int optional bytes s = 5; // UTF-8 string optional TensorProto t = 6; // tensor value - + repeated float floats = 7; // list of floats repeated int64 ints = 8; // list of ints repeated bytes strings = 9; // list of UTF-8 strings @@ -147,5 +147,5 @@ message ValueProto { } message PropertyVals { - repeated ValueProto entrys=1; + repeated ValueProto entrys=1; } diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 398c015627860..426eeeae70e55 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -19,9 +19,7 @@ #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace memory { -namespace allocation { +namespace paddle::memory::allocation { bool CPUAllocator::IsAllocThreadSafe() const { return true; } @@ -52,6 +50,4 @@ phi::Allocation *CPUAllocator::AllocateImpl(size_t size) { HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); return new Allocation(p, size, platform::CPUPlace()); } -} // namespace allocation -} // namespace memory -} // namespace paddle +} // namespace paddle::memory::allocation diff --git a/paddle/fluid/memory/allocation/memory_block_desc.cc b/paddle/fluid/memory/allocation/memory_block_desc.cc index d20d56a6d05e8..1d1f3c2396921 100644 --- a/paddle/fluid/memory/allocation/memory_block_desc.cc +++ b/paddle/fluid/memory/allocation/memory_block_desc.cc @@ -17,9 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/allocation/memory_block.h" -namespace paddle { -namespace memory { -namespace detail { +namespace paddle::memory::detail { MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, @@ -74,6 +72,4 @@ bool MemoryBlock::Desc::CheckGuards() const { return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2); } -} // namespace detail -} // namespace memory -} // namespace paddle +} // namespace paddle::memory::detail diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4714f3a2eb446..fc28e02b7bdb9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -82,7 +82,7 @@ endif() set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi common phi_utils static_prim_api get_expected_kernel_func) -register_operators(EXCLUDES py_func_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op +register_operators(EXCLUDES py_func_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op run_program_op quantize_linear_op save_combine_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} processgroup_comm_utils) op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS}) @@ -108,8 +108,6 @@ if (WITH_GPU OR WITH_ROCM) op_library(sync_batch_norm_op DEPS processgroup_comm_utils) endif() -op_library(lstm_op DEPS ${OP_HEADER_DEPS}) - set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) if (WITH_DGC) diff --git a/paddle/fluid/operators/assign_pos_op.cc b/paddle/fluid/operators/assign_pos_op.cc deleted file mode 100644 index 7def3a0cac503..0000000000000 --- a/paddle/fluid/operators/assign_pos_op.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class AssignPosOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("cum_count"), "Input", "cum_count", "AssignPos"); - OP_INOUT_CHECK( - ctx->HasInput("eff_num_len"), "Input", "eff_num_len", "AssignPos"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AssignPos"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AssignPos"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto cum_count_dtype = - OperatorWithKernel::IndicateVarDataType(ctx, "cum_count"); - auto X_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - PADDLE_ENFORCE_EQ(cum_count_dtype, - X_dtype, - phi::errors::InvalidArgument( - "The dtype of the cum_count and X should be same")); - PADDLE_ENFORCE_EQ(cum_count_dtype, - framework::proto::VarType::INT64, - phi::errors::InvalidArgument( - "The dtype of the cum_count_dtype, eff_num_len and " - "X should be same as int64")); - return phi::KernelKey(cum_count_dtype, ctx.device_context().GetPlace()); - } -}; - -class AssignPosOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "numbers to scatter."); - AddInput("cum_count", "The cumulative sum count of numbers."); - AddInput("eff_num_len", - "The effective numbers of numbers should be scattered."); - AddOutput("Out", "Assemble numbers in the order of counters."); - - AddComment(R"DOC( -assign_pos_op Operator. - -Assign pos decides which tokens should be fetched belong to -specially counter orderingly. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(assign_pos, - ops::AssignPosOp, - ops::AssignPosOpMaker); diff --git a/paddle/fluid/operators/channel_shuffle_op.cc b/paddle/fluid/operators/channel_shuffle_op.cc deleted file mode 100644 index 69f75691a0318..0000000000000 --- a/paddle/fluid/operators/channel_shuffle_op.cc +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class ChannelShuffleOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class ChannelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor, default Tensor), " - "the input feature data of ChannelShuffleOp, the layout is " - "[N, C, H, W] or [N, H, W, C]."); - AddOutput("Out", - "(Tensor, default Tensor), the output of " - "ChannelShuffleOp. The layout is also [N, C, " - "H, W] or [N, H, W, C]."); - AddAttr("groups", "number of groups to divide channels in."); - AddAttr( - "data_format", - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\", Specify the data format of the input data.") - .SetDefault("NCHW"); - - AddComment(R"DOC( - Channel Shuffle operator - This operator divides channels in a tensor of shape :math:`(*, C, H, W)` - into :math:`g` groups and rearranges them as :math:`(*, C/g, g, H, W)` - while keeping the original tensor shape. - - Please refer to the paper: - `ShuffleNet: An Extremely Efficient Convolutional Neural Network for - Mobile Devices `_ - by Zhang et. al (2017) for more details. - - )DOC"); - } -}; - -class ChannelShuffleGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class ChannelShuffleGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("channel_shuffle_grad"); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(channel_shuffle, - ChannelShuffleInferShapeFunctor, - PD_INFER_META(phi::ChannelShuffleInferMeta)); - -REGISTER_OPERATOR(channel_shuffle, - ops::ChannelShuffleOp, - ops::ChannelShuffleOpMaker, - ops::ChannelShuffleGradOpMaker, - ops::ChannelShuffleGradOpMaker, - ChannelShuffleInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(channel_shuffle_grad, - ChannelShuffleGradInferShapeFunctor, - PD_INFER_META(phi::ChannelShuffleGradInferMeta)); - -REGISTER_OPERATOR(channel_shuffle_grad, - ops::ChannelShuffleGradOp, - ChannelShuffleGradInferShapeFunctor); diff --git a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc b/paddle/fluid/operators/collective/c_allreduce_avg_op.cc index 963ea26321bdb..13d07557f1e7c 100644 --- a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_avg_op.cc @@ -14,17 +14,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" -namespace paddle { -namespace framework { +namespace paddle::framework { class OpDesc; -} // namespace framework -namespace imperative { +} // namespace paddle::framework +namespace paddle::imperative { class OpBase; -} // namespace imperative -} // namespace paddle +} // namespace paddle::imperative -namespace paddle { -namespace operators { +namespace paddle::operators { class CAllReduceAvgOpMaker : public CAllReduceOpMaker { protected: @@ -33,8 +30,7 @@ class CAllReduceAvgOpMaker : public CAllReduceOpMaker { DECLARE_INPLACE_OP_INFERER(AllreduceAvgInplaceInferer, {"X", "Out"}); -} // namespace operators -} // namespace paddle +} // namespace paddle::operators namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/collective/partial_send_op.cc b/paddle/fluid/operators/collective/partial_send_op.cc index cf2a0ece1a7ab..961b8c4cf1382 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cc @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/partial_send_op.h" -namespace paddle { -namespace operators { +namespace paddle::operators { class PartialSendOp : public framework::OperatorWithKernel { public: @@ -84,8 +83,7 @@ Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.h } }; -} // namespace operators -} // namespace paddle +} // namespace paddle::operators namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/compat/conv2d.pbtxt b/paddle/fluid/operators/compat/conv2d.pbtxt index b18e026499243..1b602fe43aab1 100644 --- a/paddle/fluid/operators/compat/conv2d.pbtxt +++ b/paddle/fluid/operators/compat/conv2d.pbtxt @@ -50,7 +50,7 @@ extra { attrs { name: "quantization_type" type: STRING - } + } attrs { name: "bit_length" type: INT diff --git a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt index c805547e0143d..ed04ecc4b71ec 100644 --- a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt +++ b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt @@ -8,7 +8,7 @@ def { } inputs { name: "Bias" - } + } outputs { name: "Output" } diff --git a/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt index bce4fc9f0e114..93bf29b8b394a 100644 --- a/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt +++ b/paddle/fluid/operators/compat/conv2d_transpose_bias.pbtxt @@ -8,7 +8,7 @@ def { } inputs { name: "Bias" - } + } outputs { name: "Output" } diff --git a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt index ee04cd73dd70c..a0d80211c2594 100644 --- a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt +++ b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt @@ -42,7 +42,7 @@ extra { attrs { name: "quantization_type" type: STRING - } + } attrs { name: "bit_length" type: INT diff --git a/paddle/fluid/operators/compat/fused_transpose.pbtxt b/paddle/fluid/operators/compat/fused_transpose.pbtxt index e4c7c218cc117..677d2e5792f75 100644 --- a/paddle/fluid/operators/compat/fused_transpose.pbtxt +++ b/paddle/fluid/operators/compat/fused_transpose.pbtxt @@ -17,26 +17,26 @@ def { extra { attrs{ name: "fused_squeeze2_axes" - type: INTS + type: INTS } attrs{ name: "fused_unsqueeze2_axes" - type: INTS + type: INTS } attrs{ name: "fused_reshape2_shape" - type: INTS + type: INTS } attrs{ name: 
"scale" - type: FLOAT + type: FLOAT } attrs{ name: "shift" - type: FLOAT + type: FLOAT } attrs{ name: "output_data_type" - type: STRING + type: STRING } } diff --git a/paddle/fluid/operators/compat/mul.pbtxt b/paddle/fluid/operators/compat/mul.pbtxt index 056f799c6c49c..28b40d0e6526c 100644 --- a/paddle/fluid/operators/compat/mul.pbtxt +++ b/paddle/fluid/operators/compat/mul.pbtxt @@ -22,7 +22,7 @@ extra { attrs { name: "Out0_threshold" type: FLOAT - } + } attrs { name: "bit_length" type: INT @@ -30,7 +30,7 @@ extra { attrs { name: "quantization_type" type: STRING - } + } attrs { name: "skip_quant" type: BOOLEAN diff --git a/paddle/fluid/operators/compat/sequence_conv.pbtxt b/paddle/fluid/operators/compat/sequence_conv.pbtxt index c5335a25c557a..679b1095a57ba 100644 --- a/paddle/fluid/operators/compat/sequence_conv.pbtxt +++ b/paddle/fluid/operators/compat/sequence_conv.pbtxt @@ -23,7 +23,7 @@ def { attrs { name: "contextStride" type: INT - } + } } extra { attrs { @@ -49,5 +49,5 @@ extra { attrs { name: "op_device" type: STRING - } + } } diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc index de04cb0e3bba5..ffdb3f01454a2 100644 --- a/paddle/fluid/operators/custom_device_common_op_registry.cc +++ b/paddle/fluid/operators/custom_device_common_op_registry.cc @@ -1366,7 +1366,10 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { float>, paddle::operators::CConcatOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - phi::dtype::float16>); + phi::dtype::float16>, + paddle::operators::CConcatOpCustomDeviceKernel< + paddle::platform::CustomDeviceContext, + phi::dtype::bfloat16>); REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_split, device_type, @@ -1378,7 +1381,10 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { int>, paddle::operators::CSplitOpCustomDeviceKernel< paddle::platform::CustomDeviceContext, - phi::dtype::float16>); + phi::dtype::float16>, + paddle::operators::CSplitOpCustomDeviceKernel< + paddle::platform::CustomDeviceContext, + phi::dtype::bfloat16>); REGISTER_OP_CUSTOM_DEVICE_KERNEL( c_embedding, device_type, diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index d9bb602338352..8489b9b6c0e28 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -6,7 +6,6 @@ endif() register_operators( EXCLUDES fused_bn_activation_op - fusion_group_op fusion_lstm_op fused_bn_add_activation_op fused_attention_op @@ -38,10 +37,6 @@ if(WITH_GPU OR WITH_ROCM) # HIP not support cudnnTransformTensor # HIP not support cudnnConvolutionBiasActivationForward op_library(fused_gate_attention_op) - # fusion_group - if(NOT APPLE AND NOT WIN32) - op_library(fusion_group_op) - endif() # fused_bn_add_activation # HIP not support bn act fuse in MIOPEN if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc deleted file mode 100644 index b42dd927c6e31..0000000000000 --- a/paddle/fluid/operators/fused/fusion_group_op.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class FusionGroupOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, phi::GPUPlace(0)); - }; -}; - -class FusionGroupOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Inputs", - "(std::vector) The inputs of fusion_group op.") - .AsDuplicable(); - AddOutput("Outs", - "(std::vector) The outputs of fusion_group op.") - .AsDuplicable(); - AddAttr>("outs_dtype", - "The data type of Outputs in fusion_group op.") - .SetDefault({}); - AddAttr>("inputs_dtype", - "The data type of Inputs in fusion_group op.") - .SetDefault({}); - AddAttr("type", "Fusion type.").SetDefault(0); - AddAttr("func_name", "Name of the generated functions.") - .SetDefault(""); - AddComment(R"DOC( -fusion_group Operator. - -It is used to execute a generated CUDA kernel which fuse the computation of -multiple operators into one. It supports several types: -0, fused computation of elementwise operations in which all the dims of inputs - and outputs should be exactly the same. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -DECLARE_INFER_SHAPE_FUNCTOR(fusion_group, - FusionGroupInferShapeFunctor, - PD_INFER_META(phi::FusionGroupInferMeta)); - -namespace ops = paddle::operators; -REGISTER_OPERATOR(fusion_group, - ops::FusionGroupOp, - ops::FusionGroupOpMaker, - FusionGroupInferShapeFunctor); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc deleted file mode 100644 index ac5cb81c060f0..0000000000000 --- a/paddle/fluid/operators/lstm_op.cc +++ /dev/null @@ -1,365 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
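(The "type 0" fusion described in the deleted op's DOC comment -- fusing elementwise operators whose inputs and outputs all have exactly the same dims -- amounts to emitting one kernel that evaluates the whole expression per element, instead of one memory pass per operator. A hedged host-side sketch of the idea only; the toy expression relu(2*x + y) is illustrative, and the real op executed generated CUDA code:)

    #include <cstddef>
    #include <vector>

    // One fused pass: the scale, add, and relu stay in a register, so no
    // intermediate tensor is materialized between the three operators.
    std::vector<float> FusedScaleAddRelu(const std::vector<float>& x,
                                         const std::vector<float>& y) {
      std::vector<float> out(x.size());
      for (std::size_t i = 0; i < x.size(); ++i) {
        float t = 2.0f * x[i] + y[i];
        out[i] = t > 0.0f ? t : 0.0f;
      }
      return out;
    }
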
*/ - -#include "paddle/fluid/operators/lstm_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class LSTMOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "LSTM"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTM"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTM"); - - OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "LSTM"); - OP_INOUT_CHECK(ctx->HasOutput("Cell"), "Output", "Cell", "LSTM"); - - bool is_test = ctx->Attrs().Get("is_test"); - - if (!is_test) { - OP_INOUT_CHECK( - ctx->HasOutput("BatchGate"), "Output", "BatchGate", "LSTM"); - OP_INOUT_CHECK(ctx->HasOutput("BatchCellPreAct"), - "Output", - "BatchCellPreAct", - "LSTM"); - } - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_EQ( - in_dims.size(), - 2, - phi::errors::InvalidArgument( - "Input(X)'s rank must be 2, but received %d.", in_dims.size())); - - if (ctx->HasInput("H0")) { - PADDLE_ENFORCE_EQ( - ctx->HasInput("C0"), - true, - phi::errors::NotFound("Input(Cell) and Input(Hidden) of LSTM " - "should not be null at the same time.")); - auto h_dims = ctx->GetInputDim("H0"); - auto c_dims = ctx->GetInputDim("C0"); - PADDLE_ENFORCE_EQ(h_dims, - c_dims, - phi::errors::InvalidArgument( - "The dimension of Input(H0) and Input(C0) should " - "be the same, but received [%s] (H0) vs [%s] (C0).", - h_dims, - c_dims)); - } - - int frame_size = static_cast(in_dims[1] / 4); - auto w_dims = ctx->GetInputDim("Weight"); - PADDLE_ENFORCE_EQ( - w_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(Weight) should be 2, but received %d.", - w_dims.size())); - PADDLE_ENFORCE_EQ(w_dims[0], - frame_size, - phi::errors::InvalidArgument( - "The first dimension of Input(Weight) should be %d, " - "but received %d.", - frame_size, - w_dims[0])); - PADDLE_ENFORCE_EQ(w_dims[1], - 4 * frame_size, - phi::errors::InvalidArgument( - "The second dimension of Input(Weight) should be 4 * " - "%d, but received %d.", - frame_size, - w_dims[1])); - - auto b_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - b_dims.size(), - 2, - phi::errors::InvalidArgument( - "The rank of Input(Bias) should be 2, but received %d.", - b_dims.size())); - PADDLE_ENFORCE_EQ( - b_dims[0], - 1, - phi::errors::InvalidArgument( - "The first dimension of Input(Bias) should be 1, but received %d.", - b_dims[0])); - - if (ctx->Attrs().Get("use_peepholes")) { - PADDLE_ENFORCE_EQ( - b_dims[1], - 7 * frame_size, - phi::errors::InvalidArgument( - "The second dimension of Input(Bias) should be 7 * %d if enable " - "peepholes connection, but received %d.", - frame_size, - b_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - b_dims[1], - 4 * frame_size, - phi::errors::InvalidArgument( - "The second dimension of Input(Bias) should be 4 * %d if disable " - "peepholes connection, but received %d.", - frame_size, - b_dims[1])); - } - - phi::DDim out_dims({in_dims[0], frame_size}); - ctx->SetOutputDim("Hidden", out_dims); - ctx->SetOutputDim("Cell", out_dims); - if (!is_test) { - ctx->SetOutputDim("BatchGate", in_dims); - ctx->SetOutputDim("BatchCellPreAct", out_dims); - } - ctx->ShareLoD("Input", "Hidden"); - ctx->ShareLoD("Input", "Cell"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return 
phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context().GetPlace()); - } -}; - -class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "Input", - "(phi::DenseTensor) the first input is a phi::DenseTensor, which " - "support variable-time length input sequence. The underlying tensor in " - "this phi::DenseTensor is a matrix with shape (T X 4D), where T is the " - "total time steps in this mini-batch, D is the hidden size."); - AddInput("H0", - "(Tensor, optional) the initial hidden state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size and D is the hidden size.") - .AsDispensable(); - AddInput("C0", - "(Tensor, optional) the initial cell state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size. `H0` and `C0` can be NULL but only at the same time.") - .AsDispensable(); - AddInput("Weight", - "(Tensor) the learnable hidden-hidden weights." - " - The shape is (D x 4D), where D is the hidden size. " - " - Weight = {W_ch, W_ih, W_fh, W_oh}"); - AddInput("Bias", - "(Tensor) the learnable weights, which contains two parts: " - "input-hidden bias weight and peephole connections weight if " - "setting `use_peepholes` True. " - "1. `use_peepholes = False` " - " - The shape is (1 x 4D). " - " - Bias = {b_c, b_i, b_f, b_o}." - "2. `use_peepholes = True` " - " - The shape is (1 x 7D). " - " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); - AddOutput("Hidden", - "(phi::DenseTensor) the hidden state of LSTM operator. " - "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput("Cell", - "(phi::DenseTensor) the cell state of LSTM operator. " - "The shape is (T x D), and lod is the same with the `Input`."); - AddOutput( - "BatchGate", - "(phi::DenseTensor) This phi::DenseTensor contains input gate, forget " - "gate " - "and output gate after the nonlinear computation. This " - "phi::DenseTensor has the same shape as the reorganized input, which " - "is also be called batch input. The LoD size is 2. The first " - "LoD is the batch offsets and the second LoD contains the " - "indexes, which denote the position of reorganized sequence " - "in the raw input.") - .AsIntermediate() - .AsExtra(); - AddOutput("BatchCellPreAct", - "(phi::DenseTensor) This phi::DenseTensor is obtained in the " - "forward and used " - "in the backward.") - .AsIntermediate() - .AsExtra(); - AddAttr("use_peepholes", - "(bool, default: True) " - "whether to enable diagonal/peephole connections.") - .SetDefault(true); - AddAttr("is_reverse", - "(bool, default: False) " - "whether to compute reversed LSTM.") - .SetDefault(false); - AddAttr("is_test", "True if in test phase.").SetDefault(false); - AddAttr( - "gate_activation", - "(string, default: sigmoid)" - "The activation for input gate, forget gate and output " - "gate, `sigmoid` by default.") - .SetDefault("sigmoid") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("cell_activation", - "(string, default: tanh)" - "The activation for cell output, `tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddAttr("candidate_activation", - "(string, default: tanh)" - "The activation for candidate hidden state, " - "`tanh` by default.") - .SetDefault("tanh") - .InEnum({"sigmoid", "tanh", "relu", "identity"}); - AddComment(R"DOC( -Long-Short Term Memory (LSTM) Operator. 
- -The default implementation is diagonal/peephole connection -(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: - -$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$ - -$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$ - -$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$ - -$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$ - -$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ - -$$ h_t = o_t \\odot act_h(c_t) $$ - -- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix - of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ - are diagonal weight matrices for peephole connections. In our implementation, - we use vectors to represent these diagonal weight matrices. -- The b terms denote bias vectors ($b_i$ is the input gate bias vector). -- $\sigma$ is the non-line activations, such as logistic sigmoid function. -- $i, f, o$ and $c$ are the input gate, forget gate, output gate, - and cell activation vectors, respectively, all of which have the same size as - the cell output activation vector $h$. -- The $\odot$ is the element-wise product of the vectors. -- $act_g$ and $act_h$ are the cell input and cell output activation functions - and `tanh` is usually used for them. -- $\tilde{c_t}$ is also called candidate hidden state, - which is computed based on the current input and the previous hidden state. - -Set `use_peepholes` False to disable peephole connection. The formula -is omitted here, please refer to the paper -http://www.bioinf.jku.at/publications/older/2604.pdf for details. - -Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ -operations on the input $x_{t}$ are NOT included in this operator. -Users can choose to use fully-connect operator before LSTM operator. 
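(Tying the deleted InferShape checks to the DOC strings above: with hidden size D, the code's frame_size = in_dims[1] / 4, every enforced shape follows mechanically. A worked instance for an arbitrary D = 8:)

$$
\mathrm{Input}: T \times 4D = T \times 32, \quad
\mathrm{Weight}: D \times 4D = 8 \times 32, \quad
\mathrm{Bias}: 1 \times 7D = 1 \times 56 \text{ (with peepholes), else } 1 \times 4D = 1 \times 32, \quad
\mathrm{Hidden},\ \mathrm{Cell}: T \times D = T \times 8.
$$
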
- -)DOC"); - } -}; - -class LSTMGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Hidden"), "Input", "Hidden", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Cell"), "Input", "Cell", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "LSTM@Grad"); - - OP_INOUT_CHECK( - ctx->HasInput("BatchGate"), "Input", "BatchGate", "LSTM@Grad"); - OP_INOUT_CHECK(ctx->HasInput("BatchCellPreAct"), - "Input", - "BatchCellPreAct", - "LSTM@Grad"); - - auto SetOutGradDim = [&ctx](const std::string& name) { - auto g_name = framework::GradVarName(name); - if (ctx->HasOutput(g_name)) - ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); - }; - - SetOutGradDim("Input"); - SetOutGradDim("Weight"); - SetOutGradDim("Bias"); - SetOutGradDim("H0"); - SetOutGradDim("C0"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context().GetPlace()); - } -}; - -template -class LSTMGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("lstm_grad"); - op->SetAttrMap(this->Attrs()); - op->SetInput("Input", this->Input("Input")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - - if (this->HasInput("H0")) { - op->SetInput("H0", this->Input("H0")); - op->SetOutput(framework::GradVarName("H0"), this->InputGrad("H0")); - } - - if (this->HasInput("C0")) { - op->SetInput("C0", this->Input("C0")); - op->SetOutput(framework::GradVarName("C0"), this->InputGrad("C0")); - } - - op->SetInput("Weight", this->Input("Weight")); - op->SetOutput(framework::GradVarName("Weight"), this->InputGrad("Weight")); - - op->SetInput("Bias", this->Input("Bias")); - op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); - - op->SetInput("Cell", this->Output("Cell")); - - op->SetInput("Hidden", this->Output("Hidden")); - op->SetInput(framework::GradVarName("Hidden"), this->OutputGrad("Hidden")); - - op->SetInput("BatchGate", this->Output("BatchGate")); - op->SetInput("BatchCellPreAct", this->Output("BatchCellPreAct")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(lstm, - ops::LSTMOp, - ops::LSTMOpMaker, - ops::LSTMGradOpMaker, - ops::LSTMGradOpMaker); -REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); - -PD_REGISTER_STRUCT_KERNEL( - lstm, CPU, ALL_LAYOUT, ops::LSTMKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstm_grad, CPU, ALL_LAYOUT, ops::LSTMGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc deleted file mode 100644 index b06521088a95a..0000000000000 --- a/paddle/fluid/operators/lstm_op.cu.cc +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/lstm_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - lstm, GPU, ALL_LAYOUT, ops::LSTMKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - lstm_grad, GPU, ALL_LAYOUT, ops::LSTMGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h deleted file mode 100644 index 9eaba45a2d597..0000000000000 --- a/paddle/fluid/operators/lstm_op.h +++ /dev/null @@ -1,444 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -#include "paddle/phi/kernels/funcs/lstm_compute.h" -#include "paddle/phi/kernels/funcs/sequence2batch.h" - -namespace paddle { -namespace operators { - -template -inline void ReorderInitState(const DeviceContext& ctx, - const phi::DenseTensor& src, - phi::Vector index_lod, - phi::DenseTensor* dst, - bool indexed_src) { - phi::funcs::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims(), ctx.GetPlace()); - row_shuffle(ctx, src, index_lod, dst, indexed_src); -} - -template -class LSTMKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - bool is_test = ctx.Attr("is_test"); - - auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); - - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); - - phi::DenseTensor* batch_gate = nullptr; - phi::DenseTensor batch_gate_temp; - if (is_test) { - batch_gate = &batch_gate_temp; - batch_gate->Resize(input->dims()); - } else { - batch_gate = ctx.Output("BatchGate"); - } - batch_gate->mutable_data(ctx.GetPlace()); - auto* hidden_out = ctx.Output("Hidden"); - hidden_out->mutable_data(ctx.GetPlace()); - auto* cell_out = ctx.Output("Cell"); - cell_out->mutable_data(ctx.GetPlace()); - - bool is_reverse = ctx.Attr("is_reverse"); - phi::funcs::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.template device_context(); - to_batch(device_ctx, *input, batch_gate, true, is_reverse); - - auto in_dims = input->dims(); - int frame_size = static_cast(in_dims[1] / 4); - phi::DDim dims({in_dims[0], frame_size}); - - if (bias) { - phi::DenseTensor b = *bias; - b.Resize({bias->numel(), 1}); - phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); - phi::funcs::RowwiseAdd add_bias; - add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); - } - - 
phi::funcs::LstmMetaValue lstm_value; - if (bias && ctx.Attr("use_peepholes")) { - T* bias_data = const_cast(bias->data()); - // the code style in LstmMetaValue will be updated later. - - lstm_value.check_ig = bias_data + 4 * frame_size; - lstm_value.check_fg = lstm_value.check_ig + frame_size; - lstm_value.check_og = lstm_value.check_fg + frame_size; - } else { - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - } - lstm_value.prev_state_value = nullptr; - phi::DenseTensor ordered_c0; - - phi::Vector order(batch_gate->lod()[2]); - - if (cell_t0) { - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized cell state also needs - // to reorder. - ReorderInitState( - device_ctx, *cell_t0, order, &ordered_c0, true); - lstm_value.prev_state_value = ordered_c0.data(); - } - - // Use the local variable as here. - phi::DenseTensor batch_hidden, batch_cell, batch_cell_pre_act_temp; - phi::DenseTensor* batch_cell_pre_act; - if (is_test) { - batch_cell_pre_act = &batch_cell_pre_act_temp; - } else { - batch_cell_pre_act = ctx.Output("BatchCellPreAct"); - } - batch_hidden.mutable_data(dims, ctx.GetPlace()); - batch_cell.mutable_data(dims, ctx.GetPlace()); - batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto gate_act = phi::funcs::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = phi::funcs::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = phi::funcs::detail::GetActivationType( - ctx.Attr("candidate_activation")); - - auto blas = phi::funcs::GetBlas(device_ctx); - for (size_t n = 0; n < num_batch; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); - phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend); - phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); - - int cur_batch_size = bend - bstart; - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_hidden_t, - false, - *weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } else if (hidden_t0) { - // If n == 0 and there is no initialized hidden state, that is to say - // the H0 is zeros, the calculation W_h * H0 will be skiped. - // If n == 0 and there is initialized hidden state, calculate W_h * H0. - - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized hidden state also needs - // to reorder. 
- phi::DenseTensor ordered_h0; - ReorderInitState( - device_ctx, *hidden_t0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, - false, - *weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } - - lstm_value.gate_value = gate_t.data(); - lstm_value.output_value = out_t.data(); - lstm_value.state_value = cell_t.data(); - lstm_value.state_active_value = cell_pre_act_t.data(); - T cell_clip = 0.0; - phi::funcs::LstmUnitFunctor::compute(device_ctx, - lstm_value, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - lstm_value.prev_state_value = lstm_value.state_value; - } - - phi::funcs::Batch2LoDTensorFunctor to_seq; - batch_hidden.set_lod(batch_gate->lod()); - // restore the output hidden in phi::DenseTensor from the batch hidden - to_seq(device_ctx, batch_hidden, hidden_out); - - batch_cell.set_lod(batch_gate->lod()); - // restore the output cell state in phi::DenseTensor from the batch cell - to_seq(device_ctx, batch_cell, cell_out); - } -}; - -template -class LSTMGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); - - auto* hidden_out = ctx.Input("Hidden"); - auto* cell_out = ctx.Input("Cell"); - - auto* batch_gate = ctx.Input("BatchGate"); - auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); - - auto* hidden_g = - ctx.Input(framework::GradVarName("Hidden")); - - auto* in_g = ctx.Output(framework::GradVarName("Input")); - auto* weight_g = - ctx.Output(framework::GradVarName("Weight")); - auto* bias_g = ctx.Output(framework::GradVarName("Bias")); - - auto* h0 = ctx.Input("H0"); - auto* c0 = ctx.Input("C0"); - - auto* h0_g = ctx.Output(framework::GradVarName("H0")); - auto* c0_g = ctx.Output(framework::GradVarName("C0")); - - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - if (weight_g) { - weight_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, weight_g, static_cast(0.0)); - } - - // ordered_h0/c0 is the reordered hidden/cell initialization. - // ordered_h0_g/c0_g is the reordered gradient of hidden/cell - // initialization. 
- phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; - phi::Vector order(batch_gate->lod()[2]); - - if (c0) { - ReorderInitState( - device_ctx, *c0, order, &ordered_c0, true); - } - if (c0 && c0_g) { - ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); - } - - auto in_dims = input->dims(); - auto out_dims = hidden_g->dims(); - int frame_size = static_cast(in_dims[1] / 4); - PADDLE_ENFORCE_EQ( - frame_size, - out_dims[1], - phi::errors::InvalidArgument( - "The second dimension of Input(" + - framework::GradVarName("Hidden") + - ") should be %d, but received %d in LSTM@Grad operator.", - frame_size, - out_dims[1])); - - phi::funcs::LstmMetaValue lstm_value; - if (bias && ctx.Attr("use_peepholes")) { - T* bias_data = const_cast(bias->data()); - lstm_value.check_ig = bias_data + 4 * frame_size; - lstm_value.check_fg = lstm_value.check_ig + frame_size; - lstm_value.check_og = lstm_value.check_fg + frame_size; - } else { - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - } - - phi::funcs::LstmMetaGrad lstm_grad; - - if (bias && bias_g) { - bias_g->mutable_data(ctx.GetPlace()); - zero(device_ctx, bias_g, static_cast(0.0)); - } - if (bias && bias_g && ctx.Attr("use_peepholes")) { - T* bias_g_data = bias_g->data(); - lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; - lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; - lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; - } else { - lstm_grad.check_ig_grad = nullptr; - lstm_grad.check_fg_grad = nullptr; - lstm_grad.check_og_grad = nullptr; - } - - phi::funcs::LoDTensor2BatchFunctor to_batch; - - auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx, - const phi::DenseTensor& src, - const phi::DDim& dims, - phi::DenseTensor& dst) { - dst.mutable_data(dims, ctx.GetPlace()); - dst.set_lod(batch_gate->lod()); - to_batch(ctx, src, &dst, false); - }; - - phi::DenseTensor batch_hidden, batch_hidden_g, batch_cell; - ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden); - ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g); - ToBatch(device_ctx, *cell_out, out_dims, batch_cell); - - phi::DenseTensor batch_cell_g, batch_gate_g; - batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); - // TODO(qingqing) support the case output cell has gradient. 
- // to_batch(device_ctx, *cell_g, batch_cell_g, false); - zero(device_ctx, &batch_cell_g, static_cast(0.0)); - batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); - batch_gate_g.set_lod(batch_gate->lod()); - - auto gate_act = phi::funcs::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = phi::funcs::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = phi::funcs::detail::GetActivationType( - ctx.Attr("candidate_activation")); - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto blas = phi::funcs::GetBlas(device_ctx); - for (int n = static_cast(num_batch) - 1; n >= 0; n--) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor gate = batch_gate->Slice(bstart, bend); - phi::DenseTensor cell = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); - lstm_value.gate_value = gate.data(); - lstm_value.state_value = cell.data(); - lstm_value.state_active_value = cell_pre_act.data(); - - phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); - phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); - phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); - lstm_grad.state_grad = cell_g.data(); - lstm_grad.gate_grad = gate_g.data(); - lstm_grad.output_grad = out_g.data(); - - if (n > 0) { - int bstart_pre = static_cast(batch_starts[n - 1]); - phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); - lstm_value.prev_state_value = cell_pre.data(); - lstm_grad.prev_state_grad = cell_pre_g.data(); - } else { - lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; - lstm_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; - } - - // lstm_value.output_value not used in bp, set to nullptr - // lstm_grad.state_active_grad not used in bp, set to nullptr - lstm_value.output_value = nullptr; - lstm_grad.state_active_grad = nullptr; - int cur_batch_size = bend - bstart; - T cell_clip = 0.0; - phi::funcs::LstmUnitGradFunctor::compute(device_ctx, - lstm_value, - lstm_grad, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &pre_hidden_g, - static_cast(1.0)); - if (weight_g) { - /* backward weight */ - auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_hidden, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - } else { - if (h0 && weight_g) { - ReorderInitState( - device_ctx, *h0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - if (h0 && h0_g) { - ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &ordered_h0_g, - static_cast(0.0)); - } - } - } - - phi::funcs::Batch2LoDTensorFunctor to_seq; - if (in_g) { - /* backward data */ - in_g->mutable_data(ctx.GetPlace()); - to_seq(device_ctx, batch_gate_g, in_g); - } - if (bias && bias_g) { - /* backward bias */ - phi::DenseTensor b_g = *bias_g; - b_g.Resize({bias_g->numel(), 1}); - phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - phi::funcs::ColwiseSum col_sum; - col_sum(device_ctx, batch_gate_g, &gate_bias_g); - } - - if (h0 && h0_g) { - ReorderInitState( - device_ctx, ordered_h0_g, order, h0_g, false); - } - if (c0 && c0_g) { - ReorderInitState( - device_ctx, ordered_c0_g, order, c0_g, false); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc deleted file mode 100644 index 7fb293891d3a5..0000000000000 --- a/paddle/fluid/operators/number_count_op.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
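(Both deleted LSTM kernels reorder H0/C0 because LoDTensor2BatchFunctor regroups rows time-step-major before the recurrence runs. A worked illustration, under the assumption -- consistent with the Slice(bstart, bend) loops above -- that sequences are processed in order of decreasing length:)

    lengths (sorted): 3, 2, 1       -> num_batch = 3 time steps
    batch_starts    : {0, 3, 5, 6}  // step n owns rows [batch_starts[n], batch_starts[n+1])
    step 0 rows: seq0[0], seq1[0], seq2[0]
    step 1 rows: seq0[1], seq1[1]
    step 2 rows: seq0[2]

(The permutation between original and batch order is carried in batch_gate->lod()[2], the order vector above, which is why the forward pass builds ordered_h0/ordered_c0 with indexed_src = true and the backward pass scatters h0_g/c0_g back with indexed_src = false.)
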
- -#include "paddle/fluid/operators/number_count_op.h" - -namespace paddle { -namespace operators { - -class NumberCountOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("numbers"), "Input", "numbers", "NumberCount"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "number_count", "NumberCount"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - // the dtype of the numbers should be same as int64 - auto number_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "numbers"); - - PADDLE_ENFORCE_EQ(number_dtype, - framework::proto::VarType::INT64, - phi::errors::InvalidArgument( - "The dtype of the number_dtype should be int64")); - return phi::KernelKey(number_dtype, ctx.GetPlace()); - } -}; - -class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("numbers", "(Tensor) The input gate index tensor."); - AddOutput("Out", "(Tensor) The output number count tensor."); - AddAttr("upper_range", "(int), The number of different numbers."); - - AddComment(R"DOC(number_count Operator.count numbers.)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(number_count, - ops::NumberCountOp, - ops::NumberCountOpMaker); diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h deleted file mode 100644 index 12ad10c3e73cc..0000000000000 --- a/paddle/fluid/operators/number_count_op.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -#if defined(PADDLE_WITH_GLOO) -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#endif - -namespace paddle { -namespace operators { - -template -class NumberCountOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override {} -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/ops_signature/assign_pos_sig.cc b/paddle/fluid/operators/ops_signature/assign_pos_sig.cc deleted file mode 100644 index 010d164d83dae..0000000000000 --- a/paddle/fluid/operators/ops_signature/assign_pos_sig.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature AssignPosOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature( - "assign_pos", {"X", "cum_count", "eff_num_len"}, {}, {"Out"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(assign_pos, phi::AssignPosOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc b/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc deleted file mode 100644 index d622a8a342789..0000000000000 --- a/paddle/fluid/operators/ops_signature/decayed_adagrad_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature DecayedAdagradOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature("decayed_adagrad", - {"Param", "Grad", "Moment", "LearningRate"}, - {"decay", "epsilon"}, - {"ParamOut", "MomentOut"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(decayed_adagrad, - phi::DecayedAdagradOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/fusion_group_sig.cc b/paddle/fluid/operators/ops_signature/fusion_group_sig.cc deleted file mode 100644 index 666e6f77d218f..0000000000000 --- a/paddle/fluid/operators/ops_signature/fusion_group_sig.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature FusionGroupOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("fusion_group", - {"Inputs"}, - {"outs_dtype", "inputs_dtype", "func_name", "type"}, - {"Outs"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(fusion_group, phi::FusionGroupOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/rrelu_sig.cc b/paddle/fluid/operators/ops_signature/rrelu_sig.cc deleted file mode 100644 index 18bda743e3255..0000000000000 --- a/paddle/fluid/operators/ops_signature/rrelu_sig.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature RReluOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature( - "rrelu", {"X"}, {"lower", "upper", "is_test"}, {"Out", "Noise"}); -} - -KernelSignature RReluGradGradOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature( - "rrelu_grad", {"X", "Noise", "Out@GRAD"}, {}, {"X@GRAD"}); -} -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(rrelu, phi::RReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(rrelu_grad, phi::RReluGradGradOpArgumentMapping); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc deleted file mode 100644 index 23441206a55c1..0000000000000 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" - -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class DecayedAdagradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Param"), "Input", "Param", "DecayedAdagradOp"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DecayedAdagradOp"); - OP_INOUT_CHECK( - ctx->HasInput("Moment"), "Input", "Moment", "DecayedAdagradOp"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), - "Input", - "LearningRate", - "DecayedAdagradOp"); - PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - phi::errors::InvalidArgument( - "The input var's type should be phi::DenseTensor, " - "but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Grad").front(), - framework::proto::VarType::LOD_TENSOR, - phi::errors::InvalidArgument( - "The input var's type should be phi::DenseTensor, " - "but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - OP_INOUT_CHECK( - ctx->HasOutput("ParamOut"), "Output", "ParamOut", "DecayedAdagradOp"); - OP_INOUT_CHECK( - ctx->HasOutput("MomentOut"), "Output", "MomentOut", "DecayedAdagradOp"); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(common::product(lr_dims), - 0, - phi::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ( - common::product(lr_dims), - 1, - phi::errors::InvalidArgument("LearningRate should have one element")); - auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, - ctx->GetInputDim("Grad"), - phi::errors::InvalidArgument( - "Param and Grad input of DecayedAdagradOp should have " - "the same dimension.")); - PADDLE_ENFORCE_EQ( - param_dims, - ctx->GetInputDim("Moment"), - phi::errors::InvalidArgument( - "Param and Moment input of DecayedAdagradOp should have " - "the same dimension.")); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("MomentOut", param_dims); - } - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Param"), - ctx.GetPlace()); - } -}; - -class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", "(Tensor) Input parameter"); - AddInput("Grad", "(Tensor) Input gradient"); - AddInput("Moment", "(Tensor) Second moment"); - AddInput("LearningRate", "(Tensor) Learning rate"); - - AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("MomentOut", "(Tensor) Output second moment"); - - AddAttr("decay", - "(float, default 0.95) " - "Discounting factor for coming gradient") - .SetDefault(0.95); - AddAttr("epsilon", - "(float, default 1.0e-6) " - "Constant for numerical stability") - .SetDefault(1.0e-6f); - AddComment(R"DOC( -Decayed Adagrad Optimizer. 
- -The update is done as follows: - -$$ -moment\_out = decay * moment + (1 - decay) * grad * grad \\ -param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon} -$$ - -The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) -does not have an epsilon attribute. It is added here for numerical -stability to avoid the division by zero error. - -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(decayed_adagrad, - DecayedAdagradShapeFunctor, - PD_INFER_META(phi::DecayedAdagradInferMeta)); - -REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, - ops::DecayedAdagradOp, - ops::DecayedAdagradOpMaker, - DecayedAdagradShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 1997d1fb99fd2..73ad94c0a5c6a 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc index 2ed2e3278acad..fce12ae865173 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index ba4f188274d18..464a8e547e508 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -12,19 +12,215 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" - #include #include #include #include #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" +namespace ops = paddle::operators; namespace paddle { namespace operators { +class ReduceBaseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceBaseOp"); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + auto dims = ctx->Attrs().Get>("dim"); + PADDLE_ENFORCE_GT(dims.size(), + 0, + phi::errors::InvalidArgument( + "The input dim dimensions of ReduceBaseOp " + "should be greater than 0. But received the dim " + "dimensions of Reduce = %d.", + dims.size())); + + for (size_t i = 0; i < dims.size(); ++i) { + PADDLE_ENFORCE_LT( + dims[i], + x_rank, + phi::errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimension = %d. 
But received dim index = %d.", + i, + x_rank, + dims[i])); + PADDLE_ENFORCE_GE( + dims[i], + -x_rank, + phi::errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimension = %d. But received dim index = %d.", + i, + x_rank, + dims[i])); + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + } + sort(dims.begin(), dims.end()); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim("Out", + common::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = common::vectorize(x_dims); + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + if (!keep_dim && dims_vector.size() == 0) { + dims_vector.push_back(1); + } + auto out_dims = common::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dims.size() > 0 && dims[0] != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } + + // oneDNN's reduction kernel is optimized only for reducing throughout the + // most outer dims, so in case of another type of reduction, it would be + // better to fallback to native implementation + static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) { + // native reduce kernels don't support bf16 + // so oneDNN kernel is enforced in that case + if (ctx.Input("X")->dtype() == phi::DataType::BFLOAT16) + return true; + + if (!ctx.HasAttr("dim") || !ctx.HasAttr("reduce_all")) { + return false; + } + + auto reduce_dims = ctx.Attr>("dim"); + const bool reduce_all = ctx.Attr("reduce_all"); + int ndims = ctx.Input("X")->dims().size(); + + if (reduce_all) { + return true; + } + + for (size_t i = 0; i < reduce_dims.size(); ++i) { + if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i]; + } + sort(reduce_dims.begin(), reduce_dims.end()); + for (size_t i = 0; i < reduce_dims.size(); ++i) { + if (reduce_dims[reduce_dims.size() - i - 1] != + static_cast(ndims - i - 1)) { + return false; + } + } + + return true; + } + + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // choose cudnn kernel if the runtime supported. 
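As a reading aid, the output-shape rule implemented by the new ReduceBaseOp::InferShape above can be condensed into a minimal standalone C++ sketch (an illustration only, not Paddle code; ReducedShape is a hypothetical name): negative axes are wrapped into range, reduced axes are kept with length 1 when keep_dim is set and erased otherwise, reduce_all collapses everything, and an empty result is represented as {1}.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical standalone helper mirroring ReduceBaseOp::InferShape's
// output-shape rule; illustration only, not the Paddle API.
std::vector<int64_t> ReducedShape(const std::vector<int64_t>& x_dims,
                                  std::vector<int> axes,
                                  bool keep_dim,
                                  bool reduce_all) {
  const int rank = static_cast<int>(x_dims.size());
  if (reduce_all) {
    return keep_dim ? std::vector<int64_t>(rank, 1) : std::vector<int64_t>{1};
  }
  for (int& a : axes) {
    assert(a >= -rank && a < rank);  // the op enforces this range
    if (a < 0) a += rank;            // wrap negative axes
  }
  std::sort(axes.begin(), axes.end());
  std::vector<int64_t> out;
  for (int i = 0; i < rank; ++i) {
    const bool reduced = std::binary_search(axes.begin(), axes.end(), i);
    if (!reduced) {
      out.push_back(x_dims[i]);
    } else if (keep_dim) {
      out.push_back(1);  // reduced axis kept with length 1
    }
  }
  if (out.empty()) out.push_back(1);  // scalar results are shaped as {1}
  return out;
}

int main() {
  // Reducing axes {-1, 0} of a [2, 3, 4] tensor without keep_dim gives [3].
  for (int64_t d : ReducedShape({2, 3, 4}, {-1, 0}, false, false)) {
    std::cout << d << ' ';
  }
  std::cout << '\n';
}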
+ auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL + if (ctx.Input("X")->dims().size() > 5 || + !HasOptimizedOneDNNKernel(ctx)) { + this->SetDnnFallback(true); + } + // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL + + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE_EQ( + ctx.GetPlace().GetType() == phi::AllocationType::GPU || + ctx.GetPlace().GetType() == phi::AllocationType::XPU || + ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM, + true, + phi::errors::InvalidArgument( + "float16 can only be used on GPU or XPU place")); + } + return phi::KernelKey(input_data_type, ctx.GetPlace()); + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), + "Input", + "Out@GRAD", + "ReduceBaseOp"); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + // TODO(dev): We should delete InferShape and migrate it into + // UnchangeInferMeta. In case 'dim' is a Variable, it will + // not exist in Attrs but in Inputs. + if (ctx->HasAttr("dim")) { + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + PADDLE_ENFORCE_LT( + dims[i], + x_rank, + phi::errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)], " + "which dimension = %d. But received dim index = %d.", + i, + x_rank, + dims[i])); + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + } + } + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); + } + } + + protected: + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + int out_dtype = ctx.Attr("out_dtype"); + auto input_data_type = + (out_dtype >= 0) + ? static_cast(out_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + + // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL + // max 5D tensor is supported + if (ctx.Input("X")->dims().size() > 5) { + dnn_fallback_ = true; + } + // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL + + return phi::KernelKey(input_data_type, ctx.GetPlace()); + } +}; // NOTE(dengkaipeng): Input(Out) is unnecessary in reduce_mean_grad // calculation, but will incur a reduce_mean_grad op after @@ -65,6 +261,7 @@ class ReduceMeanDoubleGradDescMaker : public framework::GradOpDescMakerBase { return ops; } }; + class ReduceMeanDoubleGradOpBaseMaker : public imperative::GradOpBaseMakerBase { public: using imperative::GradOpBaseMakerBase::GradOpBaseMakerBase; @@ -89,6 +286,56 @@ class ReduceMeanDoubleGradOpBaseMaker : public imperative::GradOpBaseMakerBase { } }; DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReduceMeanGradNoNeedBufferVarInferer, "X"); + +class ReduceBaseOpMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "dim", + "(list, default {0}) The dimensions to reduce. 
" + "Must be in the range [-rank(input), rank(input)). " + "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}) + .SupportTensor(); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddAttr("in_dtype", + "(int, default -1)" + "The dtype of input, default value is -1, the user could not " + "set this value.") + .SetDefault(-1); + AddAttr( + "out_dtype", + "(int, default -1)" + "The dtype of output, default value is -1, the dtype is same as intput") + .SetDefault(-1); + AddComment(string::Sprintf(R"DOC( +%s Operator. + +This operator computes the %s of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. + +)DOC", + GetOpType(), + GetName())); + } + + protected: + virtual std::string GetName() const = 0; + virtual std::string GetOpType() const = 0; +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h b/paddle/fluid/operators/reduce_ops/reduce_mean_op.h deleted file mode 100644 index eb82be83ba517..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -namespace paddle { -namespace operators { - -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); - } -}; - -struct MeanGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - dx->device(place) = dy->broadcast(dim) / dx->constant(size); - } -}; - -// TODO(zengjinle): Should refine the numeric stability of FP16 reduce_mean -// and reduce_mean_grad later. -struct FP16MeanGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - dx->device(place) = (dy->template cast().broadcast(dim) / - dx->template cast().constant(size)) - .template cast(); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h deleted file mode 100644 index 44a82397dcc07..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ /dev/null @@ -1,895 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" -// only can include the headers in paddle/phi/api dirs -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/phi_utils.h" -#include "paddle/phi/kernels/cpu/reduce.h" - -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) -#include "paddle/phi/kernels/gpu/reduce.h" -#include "paddle/phi/kernels/gpu/reduce_grad.h" -#endif - -namespace paddle { -namespace operators { - -#define HANDLE_DIM(NDIM, RDIM) \ - if (ndim == NDIM && rdim == RDIM) { \ - paddle::operators:: \ - ReduceFunctor( \ - context.template device_context(), \ - *input, \ - output, \ - dims, \ - keep_dim); \ - } - -using DDim = phi::DDim; - -inline void GetShuffledDim(const DDim& src_dims, - DDim* dst_dims, - const std::vector& reduced_dims, - std::vector* perm_axis) { - // check if it's a reduced dim - std::vector src_dims_check(src_dims.size(), false); - size_t src_size = src_dims.size(); - size_t reduce_size = reduced_dims.size(); - for (size_t i = 0; i < reduce_size; ++i) { - dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]]; - (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i]; - src_dims_check[reduced_dims[i]] = true; - } - - size_t offset = 0; - for (size_t i = 0; i < src_dims_check.size(); ++i) { - bool is_reduced = src_dims_check[i]; - if (!is_reduced) { - (*perm_axis)[offset] = i; - dst_dims->at(offset++) = src_dims[i]; - } - } -} - -static inline std::vector GetReduceDim(const std::vector& dims, - int dim_size, - bool reduce_all) { - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(dim_size); - int reduce_size = reduce_dims.size(); - for (int i = 0; i < reduce_size; ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - PADDLE_ENFORCE_LT(e, - dim_size, - phi::errors::InvalidArgument( - "ReduceBaseOp: invalid axis, when x_dims is %d, " - "axis[i] should less than x_dims, but got %d.", - dim_size, - e)); - reduce_dims.push_back(e >= 0 ? 
e : e + dim_size); - } - } - return reduce_dims; -} -template -void GetShuffledInput(const framework::ExecutionContext& context, - const phi::DenseTensor* input, - phi::DenseTensor* shuffled_input, - const std::vector& dims) { - DDim shuffled_dims(input->dims()); - std::vector perm_axis(input->dims().size()); - GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis); - - shuffled_input->Resize(shuffled_dims); - shuffled_input->mutable_data(context.GetPlace()); - - phi::funcs::TransposeNormal trans; - trans(context.template device_context(), - *input, - shuffled_input, - perm_axis); -} - -inline void GetOriginDimFromShuffled(const DDim& src_dim, - const std::vector& dims, - std::vector* origin_dim) { - DDim shuffled_dims(src_dim); - size_t n = src_dim.size(); - std::vector perm_axis(n); - GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis); - for (size_t i = 0; i < n; ++i) { - (*origin_dim)[perm_axis[i]] = i; - } -} - -template -void HandleLargeDim(const framework::ExecutionContext& context, - const phi::DenseTensor* input, - phi::DenseTensor* output, - const std::vector& dims, - bool keep_dim) { - // shuffle the reduced dim to the end - phi::DenseTensor shuffled_input; - GetShuffledInput(context, input, &shuffled_input, dims); - - // transpose to 2D tensor whose shape is {unreduced, reduced}. - const int64_t unreduced = output->numel(); - const int64_t input_numel = shuffled_input.numel(); - // assume: 0 / 0 == 0, which allow process 0 dim tensor - const int64_t reduced = (unreduced != 0) ? (input_numel / unreduced) : 0; - - PADDLE_ENFORCE_EQ( - unreduced * reduced, - input_numel, - phi::errors::InvalidArgument( - "Reducing failed in HandleLargeDim, when try to transpose (%d) " - "operands into 2D tensor with shape (%d, %d).", - input_numel, - unreduced, - reduced)); - - shuffled_input.Resize({unreduced, reduced}); - - DDim output_dim = output->dims(); - output->Resize({unreduced}); - paddle::operators::ReduceFunctor( - context.template device_context(), - shuffled_input, - output, - {1}, - keep_dim); - output->Resize(output_dim); -} - -template -void HandleLargeDimGrad(const framework::ExecutionContext& context, - const phi::DenseTensor* x, - const phi::DenseTensor* out, - const phi::DenseTensor* dout, - phi::DenseTensor* dx, - Functor functor, - const std::vector& dims) { - const int64_t unreduced = out->numel(); - const int64_t x_numel = x->numel(); - // assume: 0 / 0 == 0, which allow process 0 dim tensor - const int64_t reduced = (unreduced != 0) ? 
(x_numel / unreduced) : 0; - - PADDLE_ENFORCE_EQ( - unreduced * reduced, - x_numel, - phi::errors::InvalidArgument( - "Reducing failed in HandleLargeDimGrad, when try to transpose (%d) " - "operands into 2D tensor with shape (%d, %d).", - x_numel, - unreduced, - reduced)); - - DDim out_dim(out->dims()); - DDim x_dim(x->dims()); - // transpose and reshape X - phi::DenseTensor shuffled_x; - GetShuffledInput(context, x, &shuffled_x, dims); - DDim shuffled_dim = shuffled_x.dims(); - shuffled_x.Resize({unreduced, reduced}); - // reshape dX {unreduced, reduced} - dx->Resize({unreduced, reduced}); - ReduceGradFunctor( - context.template device_context(), - shuffled_x, - *out, - *dout, - dx, - functor, - {1}); - // transpose dX - std::vector origin_axis(x_dim.size()); - GetOriginDimFromShuffled(x_dim, dims, &origin_axis); - phi::DenseTensor dx_tmp; - framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp); - dx_tmp.Resize(shuffled_dim); - dx->Resize(x_dim); - phi::funcs::TransposeNormal trans; - trans(context.template device_context(), - dx_tmp, - dx, - origin_axis); -} - -template -struct ReduceKernelFunctor { - const phi::DenseTensor* input; - phi::DenseTensor* output; - std::vector dims; - bool keep_dim; - bool reduce_all; - const framework::ExecutionContext& context; - ReduceKernelFunctor(const phi::DenseTensor* input, - phi::DenseTensor* output, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - const framework::ExecutionContext& context) - : input(input), - output(output), - dims(dims), - keep_dim(keep_dim), - reduce_all(reduce_all), - context(context) {} - - template - void apply() const { - output->mutable_data(context.GetPlace()); - if (reduce_all) { - // Flatten and reduce 1-D tensor - auto x = EigenVector::Flatten(*input); - auto out = EigenScalar::From(*output); - auto& place = - *context.template device_context().eigen_device(); - auto reduce_dim = Eigen::array({{0}}); - Functor functor; - functor(place, &x, &out, reduce_dim); - } else { - int ndim = input->dims().size(); - int rdim = dims.size(); - if (ndim > 6) { - HandleLargeDim( - context, input, output, dims, keep_dim); - } else { - HANDLE_DIM(6, 5); - HANDLE_DIM(6, 4); - HANDLE_DIM(6, 3); - HANDLE_DIM(6, 2); - HANDLE_DIM(6, 1); - HANDLE_DIM(5, 4); - HANDLE_DIM(5, 3); - HANDLE_DIM(5, 2); - HANDLE_DIM(5, 1); - HANDLE_DIM(4, 3); - HANDLE_DIM(4, 2); - HANDLE_DIM(4, 1); - HANDLE_DIM(3, 2); - HANDLE_DIM(3, 1); - HANDLE_DIM(2, 1); - HANDLE_DIM(1, 1); - } - } - } -}; -template -class ReduceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - int out_dtype = context.Attr("out_dtype"); - framework::proto::VarType::Type cast_out_dtype; - auto* input = context.Input("X"); - - if (out_dtype < 0) { - cast_out_dtype = static_cast( - framework::TransToProtoVarType(input->dtype())); - } else { - cast_out_dtype = static_cast(out_dtype); - } - - auto& dev_ctx = context.device_context(); - output->mutable_data( - dev_ctx.GetPlace(), - static_cast(cast_out_dtype)); - - std::vector tmp_dims(dims.begin(), dims.end()); - - // call new kernel - phi::Reduce::TYPE, - T, - Functor>( - static_cast::TYPE&>(dev_ctx), - *input, - reduce_all, - tmp_dims, - keep_dim, - framework::TransToPhiDataType(cast_out_dtype), - output); - } -}; - -template -void LaunchReduceGradKernel(const framework::ExecutionContext& 
context, - const phi::DenseTensor* input0, - const phi::DenseTensor* input1, - const phi::DenseTensor* input2, - phi::DenseTensor* output, - Functor functor, - const std::vector& dims, - bool reduce_all = false) { - if (reduce_all) { - auto x = EigenVector::Flatten(*input0); - auto x_reduce = EigenVector::Flatten(*input1); - auto x_reduce_grad = EigenVector::Flatten(*input2); - auto x_grad = EigenVector::Flatten(*output); - auto& place = - *context.template device_context().eigen_device(); - auto broadcast_dim = - Eigen::array({{static_cast(input0->numel())}}); - functor(place, - &x, - &x_reduce, - &x_grad, - &x_reduce_grad, - broadcast_dim, - broadcast_dim[0]); - } else { - int rank = input0->dims().size(); - switch (rank) { - case 1: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 2: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 3: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 4: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 5: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - case 6: - ReduceGradFunctor( - context.template device_context(), - *input0, - *input1, - *input2, - output, - functor, - dims); - break; - default: - HandleLargeDimGrad( - context, input0, input1, input2, output, functor, dims); - break; - } - } -} - -template -class ReduceGradKernel : public framework::OpKernel { - public: - void ComputeFromInput(const phi::DenseTensor* input2, - const framework::ExecutionContext& context) const { - bool reduce_all = context.Attr("reduce_all"); - auto dims = context.Attr>("dim"); - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - - auto* output = - context.Output(framework::GradVarName("X")); - output->mutable_data(context.GetPlace()); - - // The dims has full dim, set the reduce_all is True - const auto& input_dim_size = - context.Input("X")->dims().size(); - std::set dims_set(dims.begin(), dims.end()); - bool full_dim = true; - for (auto i = 0; i < input_dim_size; i++) { - if (dims_set.find(i) == dims_set.end()) { - full_dim = false; - break; - } - } - reduce_all = (reduce_all || full_dim); - // NOTE: EigenTensor::From() uses tensor->data() - // if op has NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or - // kNoNeedBufferY should set true - // and use fake var that has same dims. 
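The Eigen expression dy->broadcast(dim) / dx->constant(size) in the deleted MeanGradFunctor above distributes each output gradient evenly over the elements that were averaged. A minimal standalone sketch of that rule for a single trailing-axis reduction (MeanGradLastAxis is a hypothetical helper, not Paddle code; the real path goes through LaunchReduceGradKernel/ReduceGradFunctor and handles arbitrary axes):

#include <iostream>
#include <vector>

// Hypothetical helper: gradient of mean over the trailing axis of a
// row-major [rows, cols] buffer. Each input element receives dout / cols,
// matching dy->broadcast(dim) / dx->constant(size) in MeanGradFunctor.
std::vector<float> MeanGradLastAxis(const std::vector<float>& dout,
                                    int rows, int cols) {
  std::vector<float> dx(static_cast<size_t>(rows) * cols);
  for (int r = 0; r < rows; ++r) {
    const float g = dout[r] / static_cast<float>(cols);  // dy / size
    for (int c = 0; c < cols; ++c) {
      dx[static_cast<size_t>(r) * cols + c] = g;  // broadcast over the axis
    }
  }
  return dx;
}

int main() {
  // For a 2x3 input reduced by mean over the last axis, dout = {3, 6}
  // yields dx = {1, 1, 1, 2, 2, 2}.
  for (float v : MeanGradLastAxis({3.f, 6.f}, 2, 3)) std::cout << v << ' ';
  std::cout << '\n';
}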
- if (kNoNeedBufferX) { - input0 = output; - } - if (kNoNeedBufferY) { - input1 = input2; - } - - const std::vector const_dims = dims; - - // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and - // not be set as Input in grad Maker, use Out_grad to replace here - if (!input1) input1 = input2; - Functor functor; - LaunchReduceGradKernel(context, - input0, - input1, - input2, - output, - functor, - const_dims, - reduce_all); - } - - void Compute(const framework::ExecutionContext& context) const override { - int in_dtype = context.Attr("in_dtype"); - if (in_dtype >= 0) { - phi::DenseTensor tmp_tensor; - auto* pre_input = - context.Input(framework::GradVarName("Out")); - auto in_kernel_type = - phi::KernelKey(framework::TransToProtoVarType(pre_input->dtype()), - context.GetPlace()); - auto out_kernel_type = - phi::KernelKey(static_cast(in_dtype), - context.GetPlace()); - framework::TransDataType( - in_kernel_type, out_kernel_type, *pre_input, &tmp_tensor); - ComputeFromInput(&tmp_tensor, context); - - } else { - auto* input2 = - context.Input(framework::GradVarName("Out")); - ComputeFromInput(input2, context); - } - } -}; - -class ReduceBaseOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceBaseOp"); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - auto dims = ctx->Attrs().Get>("dim"); - PADDLE_ENFORCE_GT(dims.size(), - 0, - phi::errors::InvalidArgument( - "The input dim dimensions of ReduceBaseOp " - "should be greater than 0. But received the dim " - "dimensions of Reduce = %d.", - dims.size())); - - for (size_t i = 0; i < dims.size(); ++i) { - PADDLE_ENFORCE_LT( - dims[i], - x_rank, - phi::errors::InvalidArgument( - "The reduce dim index %d should be in the " - "range [-dimension(X), dimension(X)] " - "which dimension = %d. But received dim index = %d.", - i, - x_rank, - dims[i])); - PADDLE_ENFORCE_GE( - dims[i], - -x_rank, - phi::errors::InvalidArgument( - "The reduce dim index %d should be in the " - "range [-dimension(X), dimension(X)] " - "which dimension = %d. But received dim index = %d.", - i, - x_rank, - dims[i])); - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - } - sort(dims.begin(), dims.end()); - bool reduce_all = ctx->Attrs().Get("reduce_all"); - bool keep_dim = ctx->Attrs().Get("keep_dim"); - if (reduce_all) { - if (keep_dim) - ctx->SetOutputDim("Out", - common::make_ddim(std::vector(x_rank, 1))); - else - ctx->SetOutputDim("Out", {1}); - } else { - auto dims_vector = common::vectorize(x_dims); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 1; - } - } else { - const int kDelFlag = -2; - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - } - if (!keep_dim && dims_vector.size() == 0) { - dims_vector.push_back(1); - } - auto out_dims = common::make_ddim(dims_vector); - ctx->SetOutputDim("Out", out_dims); - if (dims.size() > 0 && dims[0] != 0) { - // Only pass LoD when not reducing on the first dim. 
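HasOptimizedOneDNNKernel, both the copy added to reduce_mean_op.cc earlier in this diff and the deleted duplicate below, reduces to a trailing-suffix test: after wrapping negative axes and sorting, the reduced axes must be exactly {ndims-k, ..., ndims-1}. A standalone sketch of just that test (IsTrailingSuffix is a hypothetical name; the real function additionally forces the oneDNN kernel for bfloat16 inputs and accepts reduce_all):

#include <algorithm>
#include <iostream>
#include <vector>

// Hypothetical helper isolating the axis test in HasOptimizedOneDNNKernel:
// the reduction counts as optimized only when the reduced axes form the
// trailing suffix of the dimension list.
bool IsTrailingSuffix(std::vector<int> axes, int ndims) {
  for (int& a : axes) {
    if (a < 0) a += ndims;  // wrap negative axes, as the op does
  }
  std::sort(axes.begin(), axes.end());
  const int k = static_cast<int>(axes.size());
  for (int i = 0; i < k; ++i) {
    if (axes[i] != ndims - k + i) return false;  // gap before the suffix
  }
  return true;
}

int main() {
  std::cout << IsTrailingSuffix({-1, 2}, 4) << '\n';  // 1: axes are {2, 3}
  std::cout << IsTrailingSuffix({0, 3}, 4) << '\n';   // 0: {0, 3} is no suffix
}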
- ctx->ShareLoD("X", /*->*/ "Out"); - } - } - } - - // oneDNN's reduction kernel is optimized only for reducing throughout the - // most outer dims, so in case of another type of reduction, it would be - // better to fallback to native implementation - static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) { - // native reduce kernels don't support bf16 - // so oneDNN kernel is enforced in that case - if (ctx.Input("X")->dtype() == phi::DataType::BFLOAT16) - return true; - - if (!ctx.HasAttr("dim") || !ctx.HasAttr("reduce_all")) { - return false; - } - - auto reduce_dims = ctx.Attr>("dim"); - const bool reduce_all = ctx.Attr("reduce_all"); - int ndims = ctx.Input("X")->dims().size(); - - if (reduce_all) { - return true; - } - - for (size_t i = 0; i < reduce_dims.size(); ++i) { - if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i]; - } - sort(reduce_dims.begin(), reduce_dims.end()); - for (size_t i = 0; i < reduce_dims.size(); ++i) { - if (reduce_dims[reduce_dims.size() - i - 1] != - static_cast(ndims - i - 1)) { - return false; - } - } - - return true; - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - // choose cudnn kernel if the runtime supported. - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL - if (ctx.Input("X")->dims().size() > 5 || - !HasOptimizedOneDNNKernel(ctx)) { - this->SetDnnFallback(true); - } - // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL - - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ( - ctx.GetPlace().GetType() == phi::AllocationType::GPU || - ctx.GetPlace().GetType() == phi::AllocationType::XPU || - ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM, - true, - phi::errors::InvalidArgument( - "float16 can only be used on GPU or XPU place")); - } - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class ReduceOpUseInputPlace : public ReduceBaseOp { - public: - using ReduceBaseOp::ReduceBaseOp; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - phi::KernelKey kt = OperatorWithKernel::GetExpectedKernelType(ctx); - kt.set_backend( - phi::TransToPhiBackend(ctx.Input("X")->place())); - return kt; - } -}; - -class ReduceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ReduceBaseOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "ReduceBaseOp"); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - // TODO(dev): We should delete Infershape and migrate it into - // UnchangeInferMeta.In case of 'dim' is Variable, it will - // not exist in Attrs but in Inputs. - if (ctx->HasAttr("dim")) { - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - PADDLE_ENFORCE_LT( - dims[i], - x_rank, - phi::errors::InvalidArgument( - "The reduce dim index %d should be in the " - "range [-dimension(X), dimension(X)], " - "which dimension = %d. 
But received dim index = %d.", - i, - x_rank, - dims[i])); - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - } - } - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - int out_dtype = ctx.Attr("out_dtype"); - auto input_data_type = - (out_dtype >= 0) - ? static_cast(out_dtype) - : OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - - // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL - // max 5D tensor is supported - if (ctx.Input("X")->dims().size() > 5) { - dnn_fallback_ = true; - } - // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_DNNL - - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class ReduceBaseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", - "(Tensor) The input tensor. Tensors with rank at most 6 are " - "supported."); - AddOutput("Out", "(Tensor) The result tensor."); - AddAttr>( - "dim", - "(list, default {0}) The dimensions to reduce. " - "Must be in the range [-rank(input), rank(input)). " - "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. " - "Note that reducing on the first dim will make the LoD info lost.") - .SetDefault({0}) - .SupportTensor(); - AddAttr("keep_dim", - "(bool, default false) " - "If true, retain the reduced dimension with length 1.") - .SetDefault(false); - AddAttr("reduce_all", - "(bool, default false) " - "If true, output a scalar reduced along all dimensions.") - .SetDefault(false); - AddAttr("in_dtype", - "(int, default -1)" - "The dtype of input, default value is -1, the user could not " - "set this value.") - .SetDefault(-1); - AddAttr( - "out_dtype", - "(int, default -1)" - "The dtype of output, default value is -1, the dtype is same as intput") - .SetDefault(-1); - AddComment(string::Sprintf(R"DOC( -%s Operator. - -This operator computes the %s of input tensor along the given dimension. -The result tensor has 1 fewer dimension than the input unless keep_dim is true. -If reduce_all is true, just reduce along all dimensions and output a scalar. 
- -)DOC", - GetOpType(), - GetName())); - } - - protected: - virtual std::string GetName() const = 0; - virtual std::string GetOpType() const = 0; -}; - -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) -template - class ReduceBaseOp, - template - class TransformOp> -class ReduceCudaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - const phi::DenseTensor* input = context.Input("X"); - phi::DenseTensor* output = context.Output("Out"); - auto out_dtype = context.Attr("out_dtype"); - auto pt_out_dtype = paddle::framework::TransToPhiDataType( - static_cast(out_dtype)); - std::vector dims = context.Attr>("dim"); -#ifdef PADDLE_WITH_XPU_KP - auto& dev_ctx = context.template device_context(); -#else - auto& dev_ctx = context.cuda_device_context(); -#endif - if (out_dtype >= 0) { - output->mutable_data(dev_ctx.GetPlace(), pt_out_dtype); - } else { - output->mutable_data(dev_ctx.GetPlace(), input->dtype()); - } - - std::vector dims_int64{dims.begin(), dims.end()}; - - phi::Reduce( - dev_ctx, *input, reduce_all, dims_int64, false, pt_out_dtype, output); - } -}; - -#ifndef PADDLE_WITH_XPU_KP -template class TransformOp> -class ReduceCudaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - std::vector dims = context.Attr>("dim"); - auto* in_x = context.Input("X"); - - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto out_dtype = context.Attr("in_dtype"); - auto pt_out_dtype = framework::TransToPhiDataType( - static_cast(out_dtype)); - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); - auto update_dims = common::vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - // make new tensor - phi::DenseTensor new_d_out(d_out->type()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(common::make_ddim(update_dims)); - auto& dev_ctx = context.cuda_device_context(); - if (out_dtype > 0) { - d_x->mutable_data(dev_ctx.GetPlace(), pt_out_dtype); - } else { - d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); - } - auto pt_d_out = std::make_unique(new_d_out); - auto pt_d_x = std::make_unique(*d_x); - if (out_dtype <= 0) { - pt_out_dtype = d_out->dtype(); - } - - using MPType = typename phi::dtype::MPTypeTrait::Type; - phi::ReduceGrad>(dev_ctx, - pt_d_out.get(), - pt_d_x.get(), - pt_out_dtype, - TransformOp(reduce_num)); - } -}; - -template -struct EqualFunctor { - inline T initial() { return static_cast(0.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { - return static_cast(a == b); - } -}; - -template -struct DivideFunctor { - inline T initial() { return static_cast(1.0f); } - - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; -#endif -#endif - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_OP(op_name) \ - class __##op_name##Maker__ : public ops::ReduceBaseOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR( \ - op_name, \ 
- ops::ReduceBaseOp, \ - __##op_name##Maker__, \ - paddle::framework::DefaultGradOpMaker, \ - paddle::framework::DefaultGradOpMaker); \ - REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) - -#define REGISTER_REDUCE_OP_WITHOUT_GRAD(op_name, ...) \ - class __##op_name##Maker__ : public ops::ReduceBaseOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR( \ - op_name, \ - ops::ReduceBaseOp##__VA_ARGS__, \ - __##op_name##Maker__, \ - paddle::framework::EmptyGradOpMaker, \ - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h deleted file mode 100644 index b8043dcd94ba0..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -namespace paddle { -namespace operators { - -using DDim = phi::DDim; -template -using EigenTensor = phi::EigenTensor; -template -using EigenScalar = phi::EigenScalar; -template -using EigenVector = phi::EigenVector; - -template -void ReduceFunctor(const DeviceContext& context, - const phi::DenseTensor& input, - phi::DenseTensor* output, - const std::vector& dims, - bool keep_dim) { - auto x = EigenTensor::From(input); - auto x_rank = static_cast(x.dimensions().size()); - auto reduce_dim = Eigen::array(); - std::vector dims_ref = dims; - for (size_t i = 0; i < dims_ref.size(); ++i) { - if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; - reduce_dim[i] = dims_ref[i]; - } - // construct the squeezed output tensor - DDim out_dims = output->dims(); - if (keep_dim && x_rank > 1) { - const int kDelFlag = -2; - auto dims_vector = common::vectorize(out_dims); - for (size_t i = 0; i < dims_ref.size(); ++i) { - dims_vector[dims_ref[i]] = kDelFlag; - } - dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - out_dims = common::make_ddim(dims_vector); - } - auto& place = *context.eigen_device(); - Functor functor; - - if (D == 1) { - auto out = EigenScalar::From(*output); - functor(place, &x, &out, reduce_dim); - } else { - auto out = EigenTensor::From(*output, out_dims); - functor(place, &x, &out, reduce_dim); - } -} - -template -void ReduceGradFunctor(const DeviceContext& context, - const phi::DenseTensor& input0, - const phi::DenseTensor& input1, - const phi::DenseTensor& input2, - phi::DenseTensor* output, - Functor functor, - const std::vector& dims) { - auto x = EigenTensor::From(input0); - auto x_grad = EigenTensor::From(*output); - auto x_rank = static_cast(x.dimensions().size()); - auto x_dims = input0.dims(); - auto reduced_dims_v = common::vectorize(x_dims); - std::vector dims_ref = 
dims; - Eigen::array broadcast_dim; - for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; - - int broad_cats_times = 1; - for (size_t i = 0; i < dims_ref.size(); ++i) { - if (dims_ref[i] < 0) { - dims_ref[i] = x_rank + dims_ref[i]; - } - reduced_dims_v[dims_ref[i]] = 1; - broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; - broad_cats_times *= x_dims[dims_ref[i]]; - } - auto reduced_dims = common::make_ddim(reduced_dims_v); - auto x_reduce = EigenTensor::From(input1, reduced_dims); - auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); - - auto& place = *context.eigen_device(); - - functor(place, - &x, - &x_reduce, - &x_grad, - &x_reduce_grad, - broadcast_dim, - broad_cats_times); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc deleted file mode 100644 index 1d11c62b56956..0000000000000 --- a/paddle/fluid/operators/rrelu_op.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle::operators { - -class RReluOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class RReluOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input of RReLU op."); - AddOutput("Out", "The output of RReLU op."); - AddOutput("Noise", "The random sampled RReLU noise.") - .AsIntermediate() - .AsExtra(); - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); - float default_lower = 1. / 8.; - AddAttr("lower", "Lower bound of the uniform distribution.") - .SetDefault(default_lower) - .AddCustomChecker([](const float& lower) { - PADDLE_ENFORCE_EQ(lower >= 0.0f && lower < 1.0f, - true, - phi::errors::InvalidArgument( - "'RRelu_lower' must be between 0.0 and 1.0.")); - }); - float defalut_upper = 1. / 3.; - AddAttr("upper", "Upper bound of the uniform distribution.") - .SetDefault(defalut_upper) - .AddCustomChecker([](const float& upper) { - PADDLE_ENFORCE_EQ(upper > 0.0f && upper <= 1.0f, - true, - phi::errors::InvalidArgument( - "'RRelu_upper' must be between 0.0 and 1.0.")); - }); - AddComment(R"DOC( -RReLU Operator. - -Applies the randomized leaky rectified liner unit function, element-wise, -as described in the paper: - -`Empirical Evaluation of Rectified Activations in Convolutional Network`_. - -The function is defined as: - -.. 
math:: - \text{RReLU}(x) = - \begin{cases} - x & \text{if } x \geq 0 \\ - ax & \text{ otherwise } - \end{cases} - -where :math:`a` is randomly sampled from uniform distribution -:math:`\mathcal{U}(\text{lower}, \text{upper})`. - - See: https://arxiv.org/pdf/1505.00853.pdf - -)DOC"); - } -}; - -class RReluGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class RReluGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("rrelu_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Noise", this->Output("Noise")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace paddle::operators - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(rrelu, - RReluInferShapeFunctor, - PD_INFER_META(phi::RReluInferMeta)); - -REGISTER_OPERATOR(rrelu, - ops::RReluOp, - ops::RReluOpMaker, - ops::RReluGradOpMaker, - ops::RReluGradOpMaker, - RReluInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(rrelu_grad, - RReluGradInferShapeFunctor, - PD_INFER_META(phi::RReluGradInferMeta)); -REGISTER_OPERATOR(rrelu_grad, ops::RReluGradOp, RReluGradInferShapeFunctor); diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc deleted file mode 100644 index 6e3804fcb0a92..0000000000000 --- a/paddle/fluid/operators/tdm_child_op.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/fluid/operators/tdm_child_op.h" - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -class TDMChildOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "X(Tensor), dtype support int32/int64, X variable is the " - "node id of TDM-Tree"); - AddInput( - "TreeInfo", - "TreeInfo(Tensor), dtype support int32/int64, it stores the node " - "information in the following format: item_id(shape=1), " - "layer_id(shape=1), parent_id(shape=1), child_id(shape=child_nums)"); - AddAttr("child_nums", - "child_nums(int)" - "The child nums of one node, if the node hasn't enough child, " - "it should padding 0 until child nums equal to child_nums"); - AddOutput("Child", - "Return the children's node_id of input node, " - "if input don't have child, return 0"); - AddOutput("LeafMask", - "LeafMask has the same shape with Child" - "If child is leaf node, LeafMask value = 1, else = 0"); - AddAttr("dtype", - "(int, default INT32) " - "Output data type.") - .SetDefault(2); - AddComment(R"DOC(" - **Tdm Child** - According to the input node_id on the given tree, return the corresponding child node_id and - whether child is a leaf node by LeafMask.")DOC"); - } -}; - -class TDMChildOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - phi::errors::InvalidArgument( - "Inputs(X) of TdmChild should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("TreeInfo"), - true, - phi::errors::InvalidArgument( - "Inputs(TreeInfo) of TdmChild should not be null.")); - - int child_nums = ctx->Attrs().Get("child_nums"); - PADDLE_ENFORCE_GT( - child_nums, - 0, - phi::errors::InvalidArgument( - "ValueError: The value of the 'child_nums' must greater than 0. " - "But received child_nums value = %d, ", - child_nums)); - - auto info_dims = ctx->GetInputDim("TreeInfo"); - auto input_dims = ctx->GetInputDim("X"); - - PADDLE_ENFORCE_EQ( - info_dims.size(), - 2, - phi::errors::InvalidArgument( - "ShapeError: The dimensions of the 'tree info' must be 2. " - "But received tree info's dimensions = %d, " - "tree info's shape = [%s].", - info_dims.size(), - info_dims)); - - auto output_dims = common::vectorize(input_dims); - output_dims.push_back(child_nums); - ctx->SetOutputDim("Child", common::make_ddim(output_dims)); - ctx->SetOutputDim("LeafMask", common::make_ddim(output_dims)); - - if (ctx->GetOutputsVarType("Child")[0] == - framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Child"); - ctx->ShareLoD("X", /*->*/ "LeafMask"); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - tdm_child, - ops::TDMChildOp, - ops::TDMChildOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h deleted file mode 100644 index b645566736a9d..0000000000000 --- a/paddle/fluid/operators/tdm_child_op.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/common/flags.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/mixed_vector.h" - -namespace paddle { -namespace operators {} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc deleted file mode 100644 index db2dd6b4ced37..0000000000000 --- a/paddle/fluid/operators/tdm_sampler_op.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -class TDMSamplerOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "X(Tensor), Input variable which" - "mapping the leaf node idx of tdm tree," - "dtype support int32/int64"); - AddInput("Travel", - "Travel(Tensor), must has the same dtype with Layer" - "Contains path information of all leaf nodes to root node," - " dtype support int32/64"); - AddInput("Layer", - "Layer(Tensor), must has the same dtype with Travel " - "Indicates which nodes are in each layer"); - AddAttr("output_positive", - "output_positive(bool)" - "Whether positive samples are included in the output") - .SetDefault(true); - AddAttr>( - "neg_samples_num_list", - "neg_samples_num_list(python:list[int], C++:vector)" - "The num of negative samples in each layer") - .SetDefault({}); - AddAttr>("layer_offset_lod", - "offset lod information of Layer") - .SetDefault({}); - AddAttr("seed", - "(int) The seed used in sampler. If it is 0, " - "the sampler will generate a seed randomly.") - .SetDefault(0); - AddAttr("dtype", - "(int, default INT32) " - "Output data type.") - .SetDefault(2); - AddOutput("Out", - "Sampling result lodTensor, with shape [batch_size, layer_num, " - "neg_num_of_layer]"); - AddOutput("Labels", - "Labels of sampling result, has the same shape with Out." 
- "pos samples mapping value 1, neg sample mapping value 0") - .AsDispensable(); - AddOutput( - "Mask", - "Padding flag of Sampling result, if sampling res comes from padding," - "it will be 0, else 1, lodTensor, with shape [batch_size, " - "layer_num, neg_num_of_layer]"); - AddComment(R"DOC(" - **TDM Sampler** - According to the input positive samples at leaf node, do negative sampling layer by layer on the given tree.")DOC"); - } -}; - -class TDMSamplerOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - phi::errors::InvalidArgument( - "Inputs(Input) of TdmSampler should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Travel"), - true, - phi::errors::InvalidArgument( - "Inputs(Travel) of TdmSampler should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Layer"), - true, - phi::errors::InvalidArgument( - "Inputs(Layer) of TdmSampler should not be null.")); - auto neg_samples_num_vec = - ctx->Attrs().Get>("neg_samples_num_list"); - auto output_positive_flag = ctx->Attrs().Get("output_positive"); - - int64_t sample_res_length = 0; - for (auto sample_nums : neg_samples_num_vec) { - sample_res_length += sample_nums + (int64_t)output_positive_flag; - } - - auto input_dims = ctx->GetInputDim("X"); - auto ddim = common::make_ddim({-1, sample_res_length}); - if (ctx->IsRuntime()) { - auto output_dims = common::vectorize(input_dims); - auto batch_size = output_dims[0]; - ctx->SetOutputDim("Out", - common::make_ddim({batch_size, sample_res_length})); - ctx->SetOutputDim("Labels", - common::make_ddim({batch_size, sample_res_length})); - ctx->SetOutputDim("Mask", - common::make_ddim({batch_size, sample_res_length})); - } else { - ctx->SetOutputDim("Out", ddim); - ctx->SetOutputDim("Labels", ddim); - ctx->SetOutputDim("Mask", ddim); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - tdm_sampler, - ops::TDMSamplerOp, - ops::TDMSamplerOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc deleted file mode 100644 index 19334ca2dad6a..0000000000000 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/transfer_layout_op.h" - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace framework { -class OpDesc; -class InferShapeContext; -template -class EmptyGradOpMaker; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { - -class TransferLayoutOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - // kernel's device type is decided by input tensor place - auto *in = ctx.InputVar("X"); - auto *in_tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in); - // NOTE(zhiqiu): hot fix, allow empty tensor of kMKLDNN layout to run this - // op - if (in_tensor->layout() != DataLayout::ONEDNN) { - PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), - true, - phi::errors::PreconditionNotMet( - "The tensor of Input(X) is not initialized.")); - } - auto place = - in_tensor->IsInitialized() ? in_tensor->place() : phi::CPUPlace(); - phi::DataType dtype = in_tensor->IsInitialized() ? in_tensor->dtype() - : phi::DataType::FLOAT32; - return phi::KernelKey(phi::TransToProtoVarType(dtype), place); - } - - phi::KernelKey GetKernelTypeForVar( - const std::string &var_name, - const phi::DenseTensor &tensor, - const phi::KernelKey &expected_kernel_type) const override { - return phi::KernelKey(phi::Backend::ALL_BACKEND, - expected_kernel_type.layout(), - expected_kernel_type.dtype()); - } -}; - -class TransferLayoutInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - ctx->SyncTypeAndDataType("X", "Out"); - } -}; - -class TransferLayoutKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.InputVar("X"); - auto *out = ctx.OutputVar("Out"); - auto &dev_ctx = ctx.device_context(); - auto src_layout = ctx.Attr("src_layout"); - auto dst_layout = ctx.Attr("dst_layout"); - auto input_name = ctx.InputName("X"); - TransferLayoutFunctor( - x, out, dev_ctx, src_layout, dst_layout, input_name)(); - } -}; - -class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(phi::DenseTensor) The input Tensor"); - AddOutput("Out", - "(phi::DenseTensor) The Output Tensor with desired layout"); - // NOTE(zhiqiu): in most case, the src_layout is not needed, the op can use - // the layout - // of input X. However, in some mkldnn kernel, the src layout computed by - // GetKernelTypeForVar is different with the layout of tensor X. 
- AddAttr("src_layout", - "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3, default " - "-1 means unspecified and use the tensor's layout.") - .SetDefault(-1); - AddAttr("dst_layout", - "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3"); - AddComment(R"DOC( - TransferLayout Operator)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(transfer_layout, - TransferLayoutInferShapeFunctor, - PD_INFER_META(phi::TransferLayoutInferMeta)); -REGISTER_OPERATOR( - transfer_layout, - ops::TransferLayoutOp, - ops::TransferLayoutOpProtoMaker, - ops::TransferLayoutInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - TransferLayoutInferShapeFunctor); - -REGISTER_OP_VERSION(transfer_layout) - .AddCheckpoint(R"ROC(refine transfer_layout, add src_layout attribute)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "src_layout", - "(int, the layout of the input tensor", - -1)); diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h deleted file mode 100644 index 1b4ef2d1b5abb..0000000000000 --- a/paddle/fluid/operators/transfer_layout_op.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/data_layout_transform.h" -#include "paddle/fluid/framework/data_transform.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/device_context.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -using DataLayout = phi::DataLayout; - -class TransferLayoutFunctor { - public: - TransferLayoutFunctor(const framework::Variable *in, - framework::Variable *out, - const platform::DeviceContext &dev_ctx, - const int src_layout, - const int dst_layout, - std::string in_name) - : in_(in), - out_(out), - dev_ctx_(dev_ctx), - src_layout_(src_layout), - dst_layout_(dst_layout), - in_name_(in_name) {} - - void operator()() const { - auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_); - phi::DenseTensor out_tensor; - - auto out_layout = static_cast(dst_layout_); - out_tensor.set_layout(out_layout); - -#ifdef PADDLE_WITH_DNNL - // NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in - // data_transfer.cc - auto in_layout = static_cast(src_layout_); - auto *tensor_out = out_->GetMutable(); - VLOG(4) << in_layout << "->" << out_layout << " " << in_tensor.layout(); - if (!in_tensor.IsInitialized() && in_layout == DataLayout::ONEDNN && - out_layout == DataLayout::kNHWC) { - tensor_out->Resize(in_tensor.dims()); - tensor_out->set_layout(out_layout); - phi::funcs::MatchShapeToLayout(tensor_out, in_layout, out_layout); - return; - } - if (in_layout == DataLayout::ONEDNN || out_layout == DataLayout::ONEDNN) { - PADDLE_ENFORCE_NE( - in_layout, - out_layout, - phi::errors::PreconditionNotMet( - "No layout transform needed between two oneDNN OPKernels.")); - - if (in_layout != DataLayout::ONEDNN && out_layout == DataLayout::ONEDNN) { - // Case1 - transform from Non-ONEDNN OPKernel to ONEDNN OPKernel - // Just set layout/format. No real transform occur - - auto out_format = phi::funcs::OneDNNFormatForSize( - in_tensor.dims().size(), phi::funcs::ToOneDNNFormat(in_layout)); - out_tensor.ShareDataWith(in_tensor); - // For NHWC data we need reshape of tensors as MKL-DNN - // is expecting NHWC dims description order - if (in_layout == DataLayout::kNHWC) { - VLOG(4) << "kNHWC"; - phi::funcs::MatchShapeToLayout(&out_tensor, in_layout, out_layout); - phi::OneDNNContext::tls().set_cur_paddle_data_layout(in_layout); - } - auto out_tz = out_tensor.dims().size() == 0 - ? 
std::vector{1} - : common::vectorize(out_tensor.dims()); - dnnl::memory::data_type in_type = - phi::funcs::ToOneDNNDataType(in_tensor.dtype()); - - dnnl::memory::desc out_mem_desc(out_tz, in_type, out_format); - out_tensor.set_mem_desc(out_mem_desc); - } else { - auto target_layout = - phi::OneDNNContext::tls().get_cur_paddle_data_layout(); - // NOTE(zhiqiu): hot fix, follow the same logic in DataCopy() in - // fetch_op.cc - if (out_layout == DataLayout::kNCHW && - in_name_ == framework::GradVarName("Filter")) { - target_layout = out_layout; - } - VLOG(4) << "TransDataLayoutFromOneDNN: " << in_layout << "->" - << target_layout; - // Case2 - transform from ONEDNN OPKernel to Non-ONEDNN OPKernel - // Do transform via ONEDNN lib - phi::funcs::TransDataLayoutFromOneDNN(in_layout, - target_layout, - in_tensor, - &out_tensor, - dev_ctx_.GetPlace()); - } - } else { - // Case3 - transform between Non-ONEDNN OPKernels - TransDataLayout(dev_ctx_, in_tensor, &out_tensor); - } -#else - // Case3 - transform between Non-ONEDNN OPKernels - TransDataLayout(dev_ctx_, in_tensor, &out_tensor); -#endif - framework::SetTensorToVariable(*in_, out_tensor, out_); - } - - private: - void TransDataLayout(const platform::DeviceContext &dev_ctx, - const phi::DenseTensor &in, - phi::DenseTensor *out) const { - PADDLE_ENFORCE_EQ( - common::arity(in.dims()), - 4, - phi::errors::InvalidArgument( - "Input dimension arity only can be 4, the input dimension is %s.", - in.dims())); - - auto src_dim = in.dims(); - std::vector dst_dim; - - auto axis = framework::GetAxis(in.layout(), out->layout()); - dst_dim.resize(axis.size()); - for (size_t i = 0; i < axis.size(); i++) { - dst_dim[i] = src_dim[axis[i]]; - } - - out->Resize(common::make_ddim(dst_dim)); - out->mutable_data(in.place(), in.type()); - - framework::VisitDataType( - framework::TransToProtoVarType(in.dtype()), - framework::CastDataLayout(&dev_ctx, axis, in, out)); - } - - const framework::Variable *in_; - framework::Variable *out_; - const platform::DeviceContext &dev_ctx_; - const int src_layout_; - const int dst_layout_; - std::string in_name_; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index f9b7948de3329..6832b8f9fff2c 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -79,6 +79,9 @@ set(op_src_files_tmp set(op_vjp_src_file_tmp ${op_vjp_source_file_tmp}) +set(op_cc_split_num 4) +set(bwd_op_cc_split_num 2) + # Auto code gen execute_process( COMMAND ${PYTHON_EXECUTABLE} ${op_parse_file} --op_yaml_path @@ -95,15 +98,22 @@ execute_process( --op_compat_yaml_file ${op_compat_yaml_file} --namespaces ${op_namespace} --dialect_name ${dialect_name} --op_def_h_file ${op_header_file_tmp} --op_info_file ${op_info_file_tmp} --op_def_cc_file ${op_src_files_tmp} - --op_vjp_cc_file ${op_vjp_src_file_tmp} --with_distributed - ${WITH_DISTRIBUTE}) + --op_vjp_cc_file ${op_vjp_src_file_tmp} --op_cc_split_num + ${op_cc_split_num} --bwd_op_cc_split_num ${bwd_op_cc_split_num} + --with_distributed ${WITH_DISTRIBUTE}) + +set(split_op_source_files + ${PIR_DIALECT_BINARY_DIR}/pd_op1.cc ${PIR_DIALECT_BINARY_DIR}/pd_op2.cc + ${PIR_DIALECT_BINARY_DIR}/pd_op3.cc ${PIR_DIALECT_BINARY_DIR}/pd_op4.cc) +set(split_bwd_op_source_files ${PIR_DIALECT_BINARY_DIR}/pd_op_bwd1.cc + ${PIR_DIALECT_BINARY_DIR}/pd_op_bwd2.cc) set(generated_files_pd_op "${op_header_file}" "${op_info_file}" - "${op_source_file}" + "${split_op_source_files}" + 
"${split_bwd_op_source_files}" "${op_vjp_source_file}" - "${bwd_op_source_file}" "${fused_op_source_file}" "${bwd_fused_op_source_file}" "${pir_op_source_file}" @@ -247,8 +257,8 @@ set(op_dialect_srcs ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/op_attribute.cc ${CMAKE_CURRENT_SOURCE_DIR}/operator/ir/op_type.cc ${op_info_file} - ${op_source_file} - ${bwd_op_source_file} + ${split_op_source_files} + ${split_bwd_op_source_files} ${fused_op_source_file} ${bwd_fused_op_source_file} ${pir_op_source_file} diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc index 505b178a452b0..db7089e32177b 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc @@ -17,8 +17,7 @@ #include "paddle/common/enforce.h" #include "paddle/pir/include/core/operation.h" -namespace paddle { -namespace dialect { +namespace paddle::dialect { bool AllInputAreDist(const std::vector& inputs) { for (auto value : inputs) { @@ -159,10 +158,10 @@ pir::Attribute CreateReplicatedDistAttr(pir::Type prim_type, } return nullptr; } -pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) { - if (!prim_type) return nullptr; +pir::Type CvtToPirDistType(pir::Type global_type, pir::Attribute dist_attr) { + if (!global_type) return nullptr; auto ctx = pir::IrContext::Instance(); - if (auto dense_tensor_type = prim_type.dyn_cast()) { + if (auto dense_tensor_type = global_type.dyn_cast()) { auto tensor_dist_attr = dist_attr.dyn_cast(); if (!tensor_dist_attr) { VLOG(0) << "Convert dense tensor type to dist type with attribute {" @@ -172,7 +171,7 @@ pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) { "with non-empty TensorDistAttr")); } return DistDenseTensorType::get(ctx, dense_tensor_type, tensor_dist_attr); - } else if (auto vec_type = prim_type.dyn_cast()) { + } else if (auto vec_type = global_type.dyn_cast()) { auto array_attr = dist_attr.dyn_cast(); if (!array_attr) { VLOG(0) << "Convert vector type to dist type with attribute {" @@ -192,8 +191,8 @@ pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr) { } return pir::VectorType::get(ctx, dist_vec_type); } else { - VLOG(0) << "Convert type{" << prim_type << "} to dist type with attribute {" - << dist_attr << "}."; + VLOG(0) << "Convert type{" << global_type + << "} to dist type with attribute {" << dist_attr << "}."; PADDLE_THROW(common::errors::InvalidArgument( "Currently only support convert dense_tensor_type r vector type to " "dist.")); @@ -225,5 +224,4 @@ void CopyLeafOpToMesh(pir::Value value, ProcessMeshAttribute mesh_attr) { } } } -} // namespace dialect -} // namespace paddle +} // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h index a50331a8ea395..10f76a86e600d 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_tools.h @@ -37,7 +37,7 @@ pir::Attribute CvtToPirAttr(const phi::distributed::ArgDistAttr& dist_attr); pir::Attribute CreateReplicatedDistAttr(pir::Type prim_type, ProcessMeshAttribute mesh); -pir::Type CvtToPirDistType(pir::Type prim_type, pir::Attribute dist_attr); +pir::Type CvtToPirDistType(pir::Type global_type, pir::Attribute dist_attr); /// /// When the following conditions are met: diff --git a/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc 
b/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc index 4191eaa4bce50..5d1a9b87431f1 100644 --- a/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc +++ b/paddle/fluid/pir/dialect/distributed/transforms/fuse_allreduce_split_to_reducescatter_pass.cc @@ -35,7 +35,11 @@ class FusedAllReduceSplitPattern : public paddle::drr::DrrPatternBase { const auto &c_allreduce_sum_ = pat.Op(paddle::dialect::CAllreduceSum_Op::name(), {{"ring_id", pat.Attr("ring_id")}, - {"use_calc_stream", pat.Attr("use_calc_stream")}}); + {"use_calc_stream", pat.Attr("use_calc_stream")}, + {"execution_stream", pat.Attr("execution_stream")}, + {"force_record_event", pat.Attr("force_record_event")}, + {"event_to_record", pat.Attr("event_to_record")}, + {"events_to_wait", pat.Attr("events_to_wait")}}); const auto &assign = pat.Op(paddle::dialect::AssignOp::name()); const auto &full = pat.Op(paddle::dialect::FullOp::name()); const auto &split_with_num = pat.Op(paddle::dialect::SplitWithNumOp::name(), @@ -74,7 +78,11 @@ class FusedAllReduceSplitPattern : public paddle::drr::DrrPatternBase { res.Op(paddle::dialect::CReducescatterOp::name(), {{"ring_id", pat.Attr("ring_id")}, {"nranks", pat.Attr("num")}, - {"use_calc_stream", pat.Attr("use_calc_stream")}}); + {"use_calc_stream", pat.Attr("use_calc_stream")}}, + {{"execution_stream", pat.Attr("execution_stream")}, + {"force_record_event", pat.Attr("force_record_event")}, + {"event_to_record", pat.Attr("event_to_record")}, + {"events_to_wait", pat.Attr("events_to_wait")}}); c_reducescatter({&res.Tensor("input_grad_partial")}, {&res.Tensor("out")}); } diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 36d3a26f680a0..ed4b1bae54650 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -14,6 +14,7 @@ import argparse import logging +import math import os import pathlib import sys @@ -1130,6 +1131,21 @@ def get_mutable_attribute_grad_semantic(op_info, op_info_items): return mutable_attribute_grad_semantics +def split_ops(op_info_items: dict, cc_file, split_nums): + op_list = list(op_info_items.keys()) + ops_max_size = math.ceil(len(op_list) / split_nums) + split_op_info_items = [] + for i in range(split_nums): + split_op_info_items.append({}) + for i, op_name in enumerate(op_list): + list_idx = math.ceil((i + 1) / ops_max_size) - 1 + split_op_info_items[list_idx][op_name] = op_info_items[op_name] + split_cc_files = [] + for i in range(split_nums): + split_cc_files.append(cc_file.replace(".cc", f"{i + 1}.cc")) + return split_op_info_items, split_cc_files + + def GenOneDnnExtraAttrsDefaultValue(onednn_extra_args): INTARRAY_STR_TEMPLATE = """ pir::Attribute attr_{attr_name} = {op_attribute_type}::get(pir::IrContext::Instance(), phi::IntArray({attr})); """ @@ -2080,6 +2096,8 @@ def OpGenerator( op_info_file, op_def_cc_file, op_vjp_cc_file, + op_cc_split_num, + bwd_op_cc_split_num, onednn_yaml_file, ops_onednn_extra_yaml_file, ): @@ -2126,9 +2144,11 @@ def OpGenerator( op_infos = [] all_op_info_items = {} + new_op_def_cc_file = [] first_file = True onednn_only_op_list = [] - for yaml_file in op_yaml_files: + for idx in range(len(op_yaml_files)): + yaml_file = op_yaml_files[idx] op_yaml_items = [] with open(yaml_file, "r") as f: ops = yaml.safe_load(f) @@ -2194,13 +2214,37 @@ def OpGenerator( key_suffix = '_sp' if item.is_sparse_op else '' op_info_items[op['name'] + key_suffix] 
= item all_op_info_items[op['name'] + key_suffix] = item - op_infos.append(op_info_items) + + if dialect_name != "onednn_op": + cc_file = op_def_cc_file[idx] + if ( + yaml_file.split('/')[-1] == "ops.parsed.yaml" + and op_cc_split_num is not None + ): + split_op_info_items, split_cc_files = split_ops( + op_info_items, cc_file, op_cc_split_num + ) + op_infos.extend(split_op_info_items) + new_op_def_cc_file.extend(split_cc_files) + elif ( + yaml_file.split('/')[-1] == "backward.parsed.yaml" + and bwd_op_cc_split_num is not None + ): + split_op_info_items, split_cc_files = split_ops( + op_info_items, cc_file, bwd_op_cc_split_num + ) + op_infos.extend(split_op_info_items) + new_op_def_cc_file.extend(split_cc_files) + else: + op_infos.append(op_info_items) + new_op_def_cc_file.append(cc_file) if first_file: first_file = False if dialect_name == "onednn_op": op_infos = [all_op_info_items] + new_op_def_cc_file = op_def_cc_file # (3) auto code gen op_list_strs = [] declare_type_id_strs = [] @@ -2329,7 +2373,7 @@ def OpGenerator( f.write(op_info_str) # (6) write to files for xx_op.cc.tmp - for id in range(len(op_def_cc_file)): + for id in range(len(new_op_def_cc_file)): source_file_str = source_file_strs[id] for name in reversed(namespaces): source_file_str = NAMESPACE_GARD_TEMPLATE.format( @@ -2349,7 +2393,7 @@ def OpGenerator( input=source_file_str, define_type_id=define_type_id_strs[id], ) - with open(op_def_cc_file[id], 'w') as f: + with open(new_op_def_cc_file[id], 'w') as f: f.write(source_file_str) # (6) write to files for xx_vjp_op.cc.tmp @@ -2381,6 +2425,8 @@ def ParseArguments(): parser.add_argument('--op_info_file', type=str) parser.add_argument('--op_def_cc_file', type=str) parser.add_argument('--op_vjp_cc_file', type=str) + parser.add_argument('--op_cc_split_num', type=int) + parser.add_argument('--bwd_op_cc_split_num', type=int) parser.add_argument('--onednn_yaml_file', type=str) parser.add_argument('--ops_onednn_extra_yaml_file', type=str) parser.add_argument('--with_distributed', type=strtobool) @@ -2403,6 +2449,8 @@ def ParseArguments(): op_info_file = args.op_info_file op_def_cc_files = args.op_def_cc_file.split(",") op_vjp_cc_file = args.op_vjp_cc_file + op_cc_split_num = args.op_cc_split_num + bwd_op_cc_split_num = args.bwd_op_cc_split_num onednn_yaml_file = args.onednn_yaml_file ops_onednn_extra_yaml_file = args.ops_onednn_extra_yaml_file @@ -2417,6 +2465,8 @@ def ParseArguments(): op_info_file, op_def_cc_files, op_vjp_cc_file, + op_cc_split_num, + bwd_op_cc_split_num, onednn_yaml_file, ops_onednn_extra_yaml_file, ) diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 090aab4e3c4ed..95f104b76da51 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -168,6 +168,7 @@ 'fused_elementwise_div', 'fused_elementwise_mul', 'fused_elementwise_sub', + 'fusion_group', 'fusion_seqpool_cvm_concat', 'nce', 'lars_momentum', diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 5c7f01606c2df..777868c691c74 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -274,7 +274,7 @@ bool ConcatOpInferSymbolicShape(pir::Operation *op, SetShapeOrDataForAxis(axis); } else { 
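// Fallback branch: SetSymbolForValueByStaticShape (renamed from
// SetStaticShapeForValue) seeds the concat result with symbols derived from
// its static shape; the loop below then refreshes the axis value from that
// symbolic shape.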
pir::Value res = op->result(0); - infer_context->SetStaticShapeForValue(res); + infer_context->SetSymbolForValueByStaticShape(res); // update axis value auto res_shape = infer_context->GetShapeOrDataForValue(res); for (size_t i = 0; i < rank; ++i) { diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc index 783e56a3c505e..53c71c3fa0122 100644 --- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc +++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.cc @@ -38,6 +38,45 @@ void RewriteByInfermeta(pir::Operation* op, common::DataLayout new_layout) { } } +template <> +std::vector RelevantInputsImpl( + pir::Operation* op) { + auto concrete_op = op->dyn_cast(); + return {concrete_op.x(), concrete_op.residual()}; +} + +template <> +std::vector RelevantOutputsImpl( + pir::Operation* op) { + auto concrete_op = op->dyn_cast(); + return {concrete_op.y(), concrete_op.residual_out()}; +} + +template <> +common::DataLayout PreferLayoutImpl(pir::Operation* op) { + // Note(bukejiyu): add_group_norm_silu only supports NHWC layout now. + return common::DataLayout::NHWC; +} + +template <> +void RewriteByLayoutImpl(pir::Operation* op, + common::DataLayout new_layout) { + op->set_attribute( + "data_format", + pir::StrAttribute::get(pir::IrContext::Instance(), + common::DataLayoutToString(new_layout))); + + std::vector new_outputs = AddGroupNormSiluOp::InferMeta( + op->operands_source(), const_cast(&op->attributes())); + for (size_t i = 0; i < new_outputs.size(); ++i) { + op->result(i).set_type(new_outputs[i]); + } + + for (auto value : RelevantOutputsImpl(op)) { + SetNewLayoutForValue(value, new_layout); + } +} + template <> common::DataLayout PreferLayoutImpl(pir::Operation* op) { auto data_format_attr = op->attribute("data_format"); @@ -48,11 +87,30 @@ common::DataLayout PreferLayoutImpl(pir::Operation* op) { data_format_attr)); } - // Note(lyk): We exhibit the layout transformation for conv2d + auto concrete_op = op->dyn_cast(); + if (auto in = concrete_op.input()) { + if (auto in_type = in.type()) { + if (in_type.isa()) { + if (auto tensor_type = in_type.dyn_cast()) { + if (tensor_type.dtype().isa()) { + return common::DataLayout::NHWC; + } + } + } + } + } + + return common::StringToDataLayout(data_format_attr.AsString()); +} + +template <> +std::vector RelevantInputsImpl(pir::Operation* op) { + // Note(lyk): We exhibit the layout transformation for filter of conv2d // due to issues with its infermeta and kernel not functioning // properly in NHWC layout. However, if the FLAGS_manually_trans_conv_filter // is enabled, the transfer_layout_pass can also operate correctly. 
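  // (Only the activation input is reported as layout-relevant below; the
  // filter keeps its original order unless FLAGS_manually_trans_conv_filter
  // routes it through the transfer_layout_pass, as noted above.)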
-  return common::StringToDataLayout(data_format_attr.AsString());
+  auto concrete_op = op->dyn_cast();
+  return {concrete_op.input()};
 }
 
 template <>
@@ -78,6 +136,14 @@ common::DataLayout PreferLayoutImpl(pir::Operation* op) {
   auto original_layout =
       common::StringToDataLayout(data_format_attr.AsString());
 
+  if (op->HasAttribute(kForceBackendAttr) &&
+      op->attributes()
+              .at(kForceBackendAttr)
+              .dyn_cast<pir::StrAttribute>()
+              .AsString() == "gpu") {
+    return common::DataLayout::NHWC;
+  }
+
   auto concrete_op = op->dyn_cast();
   if (auto in = concrete_op.input()) {
     if (auto in_type = in.type()) {
@@ -124,6 +190,31 @@ void RewriteByLayoutImpl(pir::Operation* op,
   RewriteByInfermeta(op, new_layout);
 }
 
+template <>
+bool CanBeModifiedImpl(pir::Operation* op) {
+  auto data_format_attr = op->attribute("data_format");
+  if (!data_format_attr) {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "op (%s) should have attribute `data_format`, but got %s",
+        op,
+        data_format_attr));
+  }
+  auto cur_layout = common::StringToDataLayout(data_format_attr.AsString());
+  auto prefer_layout = PreferLayoutImpl(op);
+  auto can_be_modified = cur_layout != prefer_layout;
+
+  for (auto value : RelevantOutputsImpl(op)) {
+    // TODO(lyk): if the value is used in another block, we cannot rewrite
+    // this op
+    for (auto it = value.use_begin(); it != value.use_end(); ++it) {
+      if (it->owner()->GetParent() != op->GetParent()) {
+        return false;
+      }
+    }
+  }
+
+  return can_be_modified;
+}
+
 template <>
 void RewriteByLayoutImpl(pir::Operation* op,
                          common::DataLayout new_layout) {
diff --git a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
index 05719bc1dfb2f..fe0f7b440772e 100644
--- a/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
+++ b/paddle/fluid/pir/dialect/operator/interface/layout_transformation.hpp
@@ -105,9 +105,11 @@ bool CanBeModifiedImpl(pir::Operation* op) {
 class FusedConv2dAddActOp;
 OVERLOAD_PREFER_LAYOUT(FusedConv2dAddActOp);
 OVERLOAD_REWRITE_BY_LAYOUT(FusedConv2dAddActOp);
+OVERLOAD_CAN_BE_MODIFIED(FusedConv2dAddActOp);
 
 class Conv2dOp;
 OVERLOAD_PREFER_LAYOUT(Conv2dOp);
+OVERLOAD_RELEVANT_INPUTS(Conv2dOp);
 OVERLOAD_REWRITE_BY_LAYOUT(Conv2dOp);
 
 class GroupNormOp;
@@ -115,6 +117,12 @@ OVERLOAD_REWRITE_BY_LAYOUT(GroupNormOp);
 OVERLOAD_RELEVANT_INPUTS(GroupNormOp);
 OVERLOAD_RELEVANT_OUTPUTS(GroupNormOp);
 
+class AddGroupNormSiluOp;
+OVERLOAD_REWRITE_BY_LAYOUT(AddGroupNormSiluOp);
+OVERLOAD_PREFER_LAYOUT(AddGroupNormSiluOp);
+OVERLOAD_RELEVANT_INPUTS(AddGroupNormSiluOp);
+OVERLOAD_RELEVANT_OUTPUTS(AddGroupNormSiluOp);
+
 class ReshapeOp;
 OVERLOAD_RELEVANT_INPUTS(ReshapeOp);
 OVERLOAD_RELEVANT_OUTPUTS(ReshapeOp);
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
index 7fb835dd01c90..2d705364b970f 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
@@ -71,11 +71,11 @@ void set_parameter(const pir::Value& parameter, const std::string& name) {
   }
 }
 
-void updata_parameter(const pir::Value& parameter, const std::string& name) {
+void update_parameter(const pir::Value& parameter, const std::string& name) {
   pir::Parameter* param = ApiBuilder::Instance().GetParameter(name);
   PADDLE_ENFORCE_NOT_NULL(param,
                           phi::errors::InvalidArgument(
-                              "Parameter %s not exist, can not updata.", name));
+                              "Parameter %s does not exist, cannot update.",
+                              name));
   std::unique_ptr<pir::Parameter> param_new(
       new pir::Parameter(nullptr, 0,
parameter.type())); ApiBuilder::Instance().SetParameter(name, std::move(param_new)); diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h index 86d9b9a8245cc..7a89ae9eafaa8 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h @@ -36,7 +36,7 @@ pir::Value parameter(const std::string& name); void set_parameter(const pir::Value& parameter, const std::string& name); -void updata_parameter(const pir::Value& parameter, const std::string& name); +void update_parameter(const pir::Value& parameter, const std::string& name); void shadow_output(const pir::Value& persist_value, const std::string& name); diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc index c1aa3d776b67e..7b15459837fd9 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_decomp_vjp.cc @@ -61,7 +61,7 @@ std::vector> StackGradOp::DecompVjp( auto stop_gradients_attr = op->attribute(kAttrStopGradients) .dyn_cast() .AsVector(); - for (size_t i = 0; i < stop_gradients[0].size(); ++i) { + for (size_t i = 0; i < stop_gradients_attr.size(); ++i) { stop_gradients[0].push_back( stop_gradients_attr[i].dyn_cast().data()); } @@ -144,24 +144,31 @@ std::vector> ConcatGradOp::DecompVjp( .dyn_cast() .data(); - VLOG(6) << "Decomp call concat_grad's backward composite rule prepare"; + VLOG(4) << "Decomp call concat_grad's backward composite rule prepare"; std::vector> stop_gradients(op->results().size()); - if (combine_op_obj_x->HasAttribute(kAttrStopGradients)) { - auto stop_gradients_attr = op->attribute(kAttrStopGradients) - .dyn_cast() - .AsVector(); - for (size_t i = 0; i < stop_gradients[0].size(); ++i) { - stop_gradients[0].push_back( - stop_gradients_attr[i].dyn_cast().data()); + auto splitop = op->results()[0].first_use().owner(); + + if (splitop->HasAttribute("current_bwd_op_stop_gradients")) { + auto stop_gradients_attr = + splitop->attribute("current_bwd_op_stop_gradients") + .dyn_cast() + .AsVector(); + for (size_t i = 0; i < stop_gradients_attr.size(); ++i) { + auto stop_gradients_attr_j = + stop_gradients_attr[i].dyn_cast().AsVector(); + for (size_t j = 0; j < stop_gradients_attr_j.size(); ++j) { + stop_gradients[0].push_back( + stop_gradients_attr_j[j].dyn_cast().data()); + } } - VLOG(4) << " stop_gradients is set "; + VLOG(4) << " op stop_gradients is set "; } else { std::vector x_grad_stop_gradient(combine_op_obj_x.inputs().size(), false); stop_gradients[0] = x_grad_stop_gradient; - VLOG(4) << " stop_gradients is not set "; + VLOG(4) << " op stop_gradients is not set "; } std::vector> tensor_res; @@ -179,6 +186,7 @@ std::vector> ConcatGradOp::DecompVjp( paddle::primitive::details::concat_grad( x, out_grad, axis, x_grad); + VLOG(4) << "Call Pir Decomposed backward op concat_grad end"; std::vector> res(tensor_res.size()); for (size_t i = 0; i < tensor_res.size(); ++i) { diff --git a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc index 8a843a8881734..4eb8190eaa111 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc @@ -16,8 +16,7 @@ #include "paddle/common/enforce.h" #include "paddle/common/errors.h" -namespace paddle { -namespace dialect { +namespace paddle::dialect { const phi::IntArray &IntArrayAttribute::data() const { 
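// data() simply surfaces the phi::IntArray that the attribute storage keeps
// as its key.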
return storage()->GetAsKey(); } @@ -130,8 +129,7 @@ DataLayoutAttribute DataLayoutAttribute::Parse( parser.ctx, StringToDataLayoutMap().at(datalayout_token_val)); } -} // namespace dialect -} // namespace paddle +} // namespace paddle::dialect IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::IntArrayAttribute) IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ScalarAttribute) diff --git a/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc index 1d93e27c59b0b..78cb8e6460769 100644 --- a/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc +++ b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc @@ -21,8 +21,7 @@ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" -namespace paddle { -namespace dialect { +namespace paddle::dialect { std::shared_ptr ParameterConvertInterface::ParameterToVariable(pir::Parameter *parameter) { if (parameter->type().isa()) { @@ -79,7 +78,6 @@ std::unique_ptr ParameterConvertInterface::VariableToParameter( } } -} // namespace dialect -} // namespace paddle +} // namespace paddle::dialect IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ParameterConvertInterface) diff --git a/paddle/fluid/pir/drr/src/match_context_impl.h b/paddle/fluid/pir/drr/src/match_context_impl.h index a9acb5f6ed8df..ce6911fb36ecb 100644 --- a/paddle/fluid/pir/drr/src/match_context_impl.h +++ b/paddle/fluid/pir/drr/src/match_context_impl.h @@ -17,6 +17,7 @@ #include #include +#include "glog/logging.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/drr/include/drr_pattern_context.h" #include "paddle/fluid/pir/drr/src/attr_type_uilts.h" @@ -100,27 +101,32 @@ class MatchContextImpl final { tensor_map_.emplace(value_name, value); } - void BindIrOperation(const OpCall* op_call, pir::Operation* op) { + bool BindIrOperation(const OpCall* op_call, pir::Operation* op) { operation_map_.emplace(op_call, op); const auto& attrs = op_call->attributes(); for (const auto& kv : attrs) { - std::visit( + bool bind_success = std::visit( [&](auto&& arg) { if constexpr (std::is_same_v, NormalAttribute>) { - PADDLE_ENFORCE( - op->HasAttribute(kv.first), - phi::errors::NotFound( - "Not found attribute [%s] in Op [%s], please check the " - "validity of the attribute name[%s].", - kv.first, - op->name(), - kv.first)); - BindIrAttr(arg.name(), op->attribute(kv.first)); + if (op->HasAttribute(kv.first)) { + BindIrAttr(arg.name(), op->attribute(kv.first)); + return true; + } } + return false; }, kv.second); + if (!bind_success) { + LOG(WARNING) << "Not found attribute [" << kv.first << "] in Op [" + << op->name() + << "], please check the " + "validity of the attribute name[" + << kv.first << "]."; + return false; + } } + return true; } private: diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 53b7ec0c919e9..93095af050afe 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -356,7 +356,10 @@ bool DrrRewritePattern::MatchFromOutputToInput( break; } // Step 1: Bind Operation of current op to match_ctx. - source_pattern_match_ctx->BindIrOperation(drr_node, ir_node); + if (!source_pattern_match_ctx->BindIrOperation(drr_node, ir_node)) { + matched = false; + break; + } // Step 2: Bind input_tensor of current op to match_ctx. 
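    // (A candidate match is rejected below as soon as a DRR input tensor's
    // recorded consumer count disagrees with the use_count of the matched
    // pir::Value.)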
    const auto& drr_input_tensors = drr_node->inputs();
@@ -391,7 +394,7 @@ bool DrrRewritePattern::MatchFromOutputToInput(
             ir_input_values[i].use_count()) {
           matched = false;
           VLOG(8) << drr_node->name()
                   << " Match failed: consumers of drr input["
-                  << i << "] { " << drr_node->outputs().size()
+                  << i << "] { " << drr_input_tensors[i]->consumers().size()
                   << " } != consumers of pir input[" << i << "] { "
                   << ir_input_values[i].use_count() << " }.";
           break;
@@ -495,8 +498,9 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
     }
 
     // set insert point
-    size_t max_input_op_index = 0UL;
-    pir::Operation* max_index_op = nullptr;
+    // 1. get result pattern max-idx of input op
+    size_t max_res_idx = 0UL;
+    pir::Operation* max_res_idx_op = nullptr;
     for (const Tensor* input : op_call.inputs()) {
       if (input->is_none()) {
         continue;
       }
@@ -506,18 +510,16 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
       pir::Operation* ir_input_op = ir_val.defining_op();
       if (op_2_temp_program_index.count(ir_input_op) == 0) {
         // do nothing
-      } else if (max_input_op_index <
-                 op_2_temp_program_index.at(ir_input_op)) {
-        max_input_op_index = op_2_temp_program_index.at(ir_input_op);
-        max_index_op = ir_input_op;
-      } else if (max_input_op_index ==
-                 op_2_temp_program_index.at(ir_input_op)) {
-        const auto& ops_vec = temp_program[max_input_op_index];
+      } else if (max_res_idx < op_2_temp_program_index.at(ir_input_op)) {
+        max_res_idx = op_2_temp_program_index.at(ir_input_op);
+        max_res_idx_op = ir_input_op;
+      } else if (max_res_idx == op_2_temp_program_index.at(ir_input_op)) {
+        const auto& ops_vec = temp_program[max_res_idx];
         for (auto it = ops_vec.begin(); it != ops_vec.end(); it++) {
-          if (*it == max_index_op) {
+          if (*it == max_res_idx_op) {
             break;
           } else if (*it == ir_input_op) {
-            max_index_op = ir_input_op;
+            max_res_idx_op = ir_input_op;
             break;
           } else {
             // do nothing
@@ -528,25 +530,29 @@ MatchContextImpl DrrRewritePattern::CreateOperations(
         }
       }
     }
-    if (max_input_op_index == 0UL) {
-      VLOG(6) << "Not found producer op for (" << op_call.name() << ")";
-      pir::Operation* source_pattern_first_op = src_match_ctx.IrOperation(
-          source_pattern_graph.owned_op_call()[0].get());
-      max_input_op_index = op_2_temp_program_index[source_pattern_first_op];
-      rewriter.set_insertion_point(source_pattern_first_op);
-    } else {
-      rewriter.SetInsertionPointAfter(max_index_op);
-    }
-    pir::Operation* new_op =
-        CreateOperation(op_call, src_match_ctx, rewriter, &res_match_ctx);
+    // 2. get source pattern min-idx op
+    pir::Operation* min_src_idx_op = src_match_ctx.IrOperation(
+        source_pattern_graph.owned_op_call()[0].get());
+    size_t min_src_idx = op_2_temp_program_index[min_src_idx_op];
+    for (const auto& src_owned_op_call :
+         source_pattern_graph.owned_op_call()) {
+      pir::Operation* src_owned_op =
+          src_match_ctx.IrOperation(src_owned_op_call.get());
+      size_t src_owned_op_idx = op_2_temp_program_index[src_owned_op];
+      if (min_src_idx > src_owned_op_idx) {
+        min_src_idx = src_owned_op_idx;
+        min_src_idx_op = src_owned_op;
+      }
+    }
 
-    size_t new_max_input_op_index = max_input_op_index + 1;
-    op_2_temp_program_index[new_op] = new_max_input_op_index;
-    if (new_max_input_op_index >= temp_program.size()) {
-      temp_program.emplace_back();
+    // 3. insert the new op at point max(max_res_idx + 1, min_src_idx)
+    if (min_src_idx > max_res_idx) {
+      rewriter.set_insertion_point(min_src_idx_op);
+    } else {
+      rewriter.SetInsertionPointAfter(max_res_idx_op);
     }
-    temp_program[new_max_input_op_index].push_back(new_op);
+
+    CreateOperation(op_call, src_match_ctx, rewriter, &res_match_ctx);
   });
 
   return res_match_ctx;
diff --git a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
index fcbfcbb910e1e..61113f8e9dfc5 100644
--- a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
+++ b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc
@@ -278,7 +278,7 @@ struct FlowGraph {
       }
     }
 
-    std::unordered_set<Node> nhwc_nodes;
+    std::unordered_set<Node> mutable_nodes;
     for (auto& op : *(program.block())) {
       auto layout_transform_iface =
           op.dyn_cast<paddle::dialect::LayoutTransformationInterface>();
@@ -286,10 +286,14 @@ struct FlowGraph {
         continue;
       }
 
+      if (!layout_transform_iface.CanBeModified(&op)) {
+        continue;
+      }
+
       auto prefer_layout = layout_transform_iface.PreferLayout(&op);
       if (prefer_layout == common::DataLayout::NHWC) {
         Node op_node(&op);
-        nhwc_nodes.insert(op_node);
+        mutable_nodes.insert(op_node);
         AddEdge(op_node, dst_node(), INF);
         VLOG(10) << "[PreProcess] node: " << op_node
                  << " should be set to NHWC";
@@ -302,7 +306,7 @@ struct FlowGraph {
     // operations that have a determined layout and spread their layout to
     // their outputs and inputs recursively.
     std::queue<Node> q;
-    for (auto& n : nhwc_nodes) {
+    for (auto& n : mutable_nodes) {
       q.push(n);
     }
     std::unordered_set<Node> is_node_layout_visited;
@@ -362,13 +366,14 @@ struct FlowGraph {
               // a point of cut edge. So we set its outputs and inputs to
               // immutable.
               Node in_node = Node(v.defining_op());
-              nhwc_nodes.erase(in_node);
-              VLOG(10) << "erase node: " << in_node << " from nhwc set";
+              mutable_nodes.erase(in_node);
+              VLOG(10) << "erase node: " << in_node << " from mutable set";
               for (auto it = v.use_begin(); it != v.use_end(); ++it) {
                 Node out_node(it->owner());
-                nhwc_nodes.erase(out_node);
-                VLOG(10) << "erase node: " << out_node << " from nhwc set";
+                mutable_nodes.erase(out_node);
+                VLOG(10)
+                    << "erase node: " << out_node << " from mutable set";
               }
             }
             return !can_be_transformed;
@@ -380,8 +385,8 @@ struct FlowGraph {
         continue;
       }
 
-      VLOG(10) << "add node to nhwc set: " << node;
-      nhwc_nodes.insert(node);
+      VLOG(10) << "add node to mutable set: " << node;
+      mutable_nodes.insert(node);
 
       VLOG(10) << "processing node successor: " << node;
 
@@ -403,7 +408,7 @@ struct FlowGraph {
         continue;
       }
       is_node_layout_visited.insert(node);
-      if (nhwc_nodes.count(node) == 0) {
+      if (mutable_nodes.count(node) == 0) {
         VLOG(10) << "add node to nchw set: " << node;
         AddEdge(src_node(), node, INF);
       }
@@ -542,7 +547,7 @@ using Edge = FlowGraph::Edge;
 class TransferLayoutPass : public pir::Pass {
  public:
-  TransferLayoutPass() : pir::Pass("transfer_layout_pass", 3) {}
+  TransferLayoutPass() : pir::Pass("transfer_layout_pass", 2) {}
 
   bool CanApplyOn(pir::Operation* op) const override {
     if (!op->isa()) {
diff --git a/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
index 35afabe3ad1dc..f8675afec6c57 100644
--- a/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
+++ b/paddle/fluid/pir/transforms/gpu/add_norm_fuse_pass.cc
@@ -141,11 +141,13 @@ class RmsNormFusePattern : public paddle::drr::DrrPatternBase {
 class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase {
  private:
   const bool extra_add_;
+  const bool trans_extra_add_;
 
  public:
-  explicit 
AddRmsNormFusePattern(bool extra_add) : extra_add_(extra_add) {} + AddRmsNormFusePattern(bool extra_add, bool trans_extra_add) + : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {} - uint32_t benefit() const override { return extra_add_ ? 2 : 1; } + uint32_t benefit() const override { return extra_add_ ? 4 : 3; } std::string name() const override { return "AddRmsNormFusePattern"; } @@ -176,7 +178,9 @@ class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase { if (extra_add_) { const auto &add1 = pat.Op(paddle::dialect::AddOp::name()); pat.Tensor("add_out1") = - add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); + trans_extra_add_ + ? add1(pat.Tensor("any_tensor"), pat.Tensor("add_out")) + : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); } paddle::drr::ResultPattern res = pat.ResultPattern(); const auto &res_rms_norm = @@ -207,11 +211,13 @@ class AddRmsNormFusePattern : public paddle::drr::DrrPatternBase { class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase { private: const bool extra_add_; + const bool trans_extra_add_; public: - explicit AddLayerNormFusePattern(bool extra_add) : extra_add_(extra_add) {} + AddLayerNormFusePattern(bool extra_add, bool trans_extra_add) + : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {} - uint32_t benefit() const override { return extra_add_ ? 2 : 1; } + uint32_t benefit() const override { return extra_add_ ? 4 : 3; } std::string name() const override { return "AddLayerNormFusePattern"; } void operator()(paddle::drr::DrrPatternContext *ctx) const override { @@ -231,22 +237,20 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase { if (extra_add_) { const auto &add1 = pat.Op(paddle::dialect::AddOp::name()); pat.Tensor("add_out1") = - add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); + trans_extra_add_ + ? 
add1(pat.Tensor("any_tensor"), pat.Tensor("add_out")) + : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); } paddle::drr::ResultPattern res = pat.ResultPattern(); const auto &cast_op_dtype = res.ComputeAttr( [](const paddle::drr::MatchContext &match_ctx) -> phi::DataType { - auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); - return paddle::dialect::TransToPhiDataType(x_dtype); + return phi::DataType::FLOAT32; }); - const auto &cast_op_1 = + const auto cast_1_op = res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}}); - res.Tensor("casted_bias") = cast_op_1(res.Tensor("bias")); - const auto &cast_op_2 = + const auto cast_2_op = res.Op(paddle::dialect::CastOp::name(), {{"dtype", cast_op_dtype}}); - res.Tensor("casted_w") = cast_op_2(res.Tensor("w")); - const auto &fuse_layer_norm = res.Op(paddle::dialect::FusedBiasResidualLayernormOp::name(), {{"epsilon", pat.Attr("epsilon")}, @@ -256,14 +260,15 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase { {"quant_round_type", res.Int32Attr(0)}, {"quant_max_bound", res.Float32Attr(0.0)}, {"quant_min_bound", res.Float32Attr(0.0)}}); - + res.Tensor("w_cast") = cast_1_op(res.Tensor("w")); + res.Tensor("bias_cast") = cast_1_op(res.Tensor("bias")); fuse_layer_norm( { &res.Tensor("x"), - &res.Tensor("casted_bias"), - &res.Tensor("residual"), - &res.Tensor("casted_w"), &res.InputNoneTensor(), + &res.Tensor("residual"), + &res.Tensor("w_cast"), + &res.Tensor("bias_cast"), }, {&res.Tensor("layer_norm_out"), &res.Tensor("add_out"), @@ -272,6 +277,163 @@ class AddLayerNormFusePattern : public paddle::drr::DrrPatternBase { } }; +class AddGroupNormFusePattern : public paddle::drr::DrrPatternBase { + private: + const bool extra_add_; + const bool trans_extra_add_; + + public: + AddGroupNormFusePattern(bool extra_add, bool trans_extra_add) + : extra_add_(extra_add), trans_extra_add_{trans_extra_add} {} + + uint32_t benefit() const override { return extra_add_ ? 4 : 3; } + std::string name() const override { return "AddGroupNormFusePattern"; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &add = pat.Op(paddle::dialect::AddOp::name()); + const auto &group_norm = pat.Op(paddle::dialect::GroupNormOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}); + pat.Tensor("add_out") = add(pat.Tensor("x"), pat.Tensor("residual")); + group_norm( + {&pat.Tensor("add_out"), &pat.Tensor("scale"), &pat.Tensor("bias")}, + {&pat.Tensor("group_out"), + &pat.Tensor("mean_out_0"), + &pat.Tensor("variance_out_0")}); + // TODO(bukejiyu) :DRR support matching placeholder op, + // the following needs to be deleted + if (extra_add_) { + const auto &add1 = pat.Op(paddle::dialect::AddOp::name()); + pat.Tensor("add_out1") = + trans_extra_add_ + ? 
add1(pat.Tensor("any_tensor"), pat.Tensor("add_out")) + : add1(pat.Tensor("add_out"), pat.Tensor("any_tensor")); + } + pat.AddConstraint([this](const paddle::drr::MatchContext &match_ctx) { + auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); + if (!x_dtype.isa() && + !x_dtype.isa()) { + return false; + } + return true; + }); + paddle::drr::ResultPattern res = pat.ResultPattern(); + const auto &add_group_norm_silu_op = + res.Op(paddle::dialect::AddGroupNormSiluOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"activation", res.StrAttr("")}}); + + add_group_norm_silu_op({&res.Tensor("x"), + &res.Tensor("residual"), + &res.Tensor("scale"), + &res.Tensor("bias")}, + {&res.Tensor("group_out"), + &res.Tensor("add_out"), + &res.Tensor("mean_out"), + &res.Tensor("variance_out")}); + } +}; + +class AddGroupNormWithActPattern : public paddle::drr::DrrPatternBase { + public: + uint32_t benefit() const override { return 2; } + std::string name() const override { return "AddGroupNormWithActPattern"; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &add_group_norm_silu_op = + pat.Op(paddle::dialect::AddGroupNormSiluOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"activation", pat.Attr("activation")}}); + const auto &silu = pat.Op(paddle::dialect::SiluOp::name()); + add_group_norm_silu_op({&pat.Tensor("x"), + &pat.Tensor("residual"), + &pat.Tensor("scale"), + &pat.Tensor("bias")}, + {&pat.Tensor("group_out"), + &pat.Tensor("add_out"), + &pat.Tensor("mean_out_0"), + &pat.Tensor("variance_out_0")}); + pat.Tensor("silu_out") = silu(pat.Tensor("group_out")); + pat.AddConstraint([this](const paddle::drr::MatchContext &match_ctx) { + auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); + if (!x_dtype.isa() && + !x_dtype.isa()) { + return false; + } + auto activation = match_ctx.Attr("activation"); + if (activation != "") { + return false; + } + return true; + }); + paddle::drr::ResultPattern res = pat.ResultPattern(); + const auto &res_add_group_norm_silu_op = + res.Op(paddle::dialect::AddGroupNormSiluOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"activation", res.StrAttr("silu")}}); + res_add_group_norm_silu_op({&res.Tensor("x"), + &res.Tensor("residual"), + &res.Tensor("scale"), + &res.Tensor("bias")}, + {&res.Tensor("silu_out"), + &res.Tensor("add_out"), + &res.Tensor("mean_out"), + &res.Tensor("variance_out")}); + } +}; + +class GroupNormWithActPattern : public paddle::drr::DrrPatternBase { + public: + uint32_t benefit() const override { return 1; } + std::string name() const override { return "GroupNormWithActPattern"; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + const auto &group_norm = pat.Op(paddle::dialect::GroupNormOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}); + const auto &silu = pat.Op(paddle::dialect::SiluOp::name()); + group_norm({&pat.Tensor("x"), &pat.Tensor("scale"), &pat.Tensor("bias")}, + {&pat.Tensor("group_out"), + &pat.Tensor("mean_out_0"), + &pat.Tensor("variance_out_0")}); + pat.Tensor("silu_out") = silu(pat.Tensor("group_out")); + 
pat.AddConstraint([this](const paddle::drr::MatchContext &match_ctx) { + auto x_dtype = pir::GetDataTypeFromValue(match_ctx.Tensor("x")); + if (!x_dtype.isa() && + !x_dtype.isa()) { + return false; + } + return true; + }); + paddle::drr::ResultPattern res = pat.ResultPattern(); + const auto &add_group_norm_silu_op = + res.Op(paddle::dialect::AddGroupNormSiluOp::name(), + {{"epsilon", pat.Attr("epsilon")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"activation", res.StrAttr("silu")}}); + add_group_norm_silu_op({&res.Tensor("x"), + &res.InputNoneTensor(), + &res.Tensor("scale"), + &res.Tensor("bias")}, + {&res.Tensor("silu_out"), + &res.OutputNoneTensor(), + &res.Tensor("mean_out"), + &res.Tensor("variance_out")}); + } +}; + class AddNormFusePass : public pir::PatternRewritePass { public: AddNormFusePass() : pir::PatternRewritePass("add_norm_fuse_pass", 2) {} @@ -290,13 +452,37 @@ class AddNormFusePass : public pir::PatternRewritePass { // x-------- // add-rms_norm ---> rms_norm // residual- - ps.Add(paddle::drr::Create(context, !extra_add)); - ps.Add(paddle::drr::Create(context, extra_add)); + ps.Add( + paddle::drr::Create(context, !extra_add, false)); + ps.Add( + paddle::drr::Create(context, extra_add, true)); + ps.Add( + paddle::drr::Create(context, extra_add, false)); + // x-------- // add-layer_norm ----> fused_bias_residual_layernorm // residual- - ps.Add(paddle::drr::Create(context, !extra_add)); - ps.Add(paddle::drr::Create(context, extra_add)); + ps.Add(paddle::drr::Create( + context, !extra_add, false)); + ps.Add( + paddle::drr::Create(context, extra_add, true)); + ps.Add(paddle::drr::Create( + context, extra_add, false)); + + // x-------- + // add-group_norm ----> add_group_norm_silu + // residual- + ps.Add(paddle::drr::Create( + context, !extra_add, true)); + ps.Add( + paddle::drr::Create(context, extra_add, true)); + ps.Add(paddle::drr::Create( + context, extra_add, false)); + + // add_group_norm_silu-silu --->add_group_norm_silu + ps.Add(paddle::drr::Create(context)); + // group-silu->add_group_norm_silu + ps.Add(paddle::drr::Create(context)); return ps; } }; diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc index 96851cfeac559..754422312e47a 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_act_fuse_pass.cc @@ -35,8 +35,8 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { private: std::string act_name_; bool cutlass_pattern_; - const std::unordered_set conv2d_depthwise_act_set_ = { - "relu", "swish", "sigmoid"}; + const std::unordered_set conv2d_depthwise_act_set_ = {"relu", + "swish"}; public: static const int CUTLASS_NHWC_ALIGNMENT = 8; @@ -152,62 +152,6 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { [this](const paddle::drr::MatchContext &match_ctx) -> std::string { return cutlass_pattern_ ? 
"gpu" : "gpudnn"; }); - const auto &perm_weight_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ || data_format == "NHWC") { - return {0, 2, 3, 1}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &perm_input_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return {0, 2, 3, 1}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &perm_bias_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - auto bias_shape = pir::GetShapeFromValue(match_ctx.Tensor("bias")); - if (cutlass_pattern_ && data_format == "NCHW") { - if (bias_shape.size() == 4) { - return {0, 2, 3, 1}; - } else if (bias_shape.size() == 3) { - return {0, 2, 1}; - } else { - return {0}; - } - } else { - std::vector dst_vector(bias_shape.size()); - std::iota(dst_vector.begin(), dst_vector.end(), 0); - return dst_vector; - } - }); - const auto &data_format_conv = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::string { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return "NHWC"; - } else { - return data_format; - } - }); - // TODO(bukejiyu) When the transfer_layout_pass is supported, - // transpose_op will be deleted. - const auto &transpose_op_w = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_weight_shape}}); - const auto &transpose_op_input = res.Op( - paddle::dialect::TransposeOp::name(), {{"perm", perm_input_shape}}); - const auto &transpose_op_bias = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_bias_shape}}); - res.Tensor("filter_transpose") = transpose_op_w(res.Tensor("filter")); - res.Tensor("input_transpose") = transpose_op_input(res.Tensor("input")); - res.Tensor("bias_transpose") = transpose_op_bias(res.Tensor("bias")); const auto &fused_conv2d_add_act = res.Op( paddle::dialect::FusedConv2dAddActOp::name(), {{ @@ -216,7 +160,7 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { {"padding_algorithm", pat.Attr("padding_algorithm")}, {"dilations", pat.Attr("dilations")}, {"groups", pat.Attr("groups")}, - {"data_format", data_format_conv}, + {"data_format", pat.Attr("data_format")}, {"activation", res.StrAttr(act_name_)}, {"split_channels", res.VectorInt32Attr({})}, {"exhaustive_search", res.BoolAttr(false)}, @@ -224,24 +168,11 @@ class Conv2dAddActFusePassDrrPattern : public paddle::drr::DrrPatternBase { {"fuse_alpha", res.Float32Attr(0.0f)}, }}, {{{paddle::dialect::kForceBackendAttr, force_backend_runtime_attr}}}); - fused_conv2d_add_act({&res.Tensor("input_transpose"), - &res.Tensor("filter_transpose"), - &res.Tensor("bias_transpose"), + fused_conv2d_add_act({&res.Tensor("input"), + &res.Tensor("filter"), + &res.Tensor("bias"), &res.InputNoneTensor()}, - {&res.Tensor("fuesd_conv2d_add_act_out")}); - const auto &perm_out_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return {0, 3, 1, 2}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &transpose_op_out = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_out_shape}}); - 
res.Tensor("act_out") = - transpose_op_out(res.Tensor("fuesd_conv2d_add_act_out")); + {&res.Tensor("act_out")}); } }; @@ -278,11 +209,9 @@ class Conv2dAdd2ActFusePattern if (next_op->isa()) { act_name = "relu"; } -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 8000 && CUDNN_VERSION < 8700 +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8700 if (next_op->isa()) { act_name = "tanh"; - } else if (next_op->isa()) { - act_name = "sigmoid"; } #endif if (act_name == "") { @@ -346,11 +275,10 @@ class Conv2dAddActFusePass : public pir::PatternRewritePass { paddle::dialect::FusedConv2dAddActOp::name()}); // NOTE(liuyuanle): cudnn [8.7, 8.9 now) version has bug when act is -// sigmoid/tanh. Ref to issue +// tanh. Ref to issue // https://github.com/PaddlePaddle/Paddle/issues/50853 #if CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8700 - const std::unordered_set cudnn_act_set( - {"relu", "sigmoid", "tanh"}); + const std::unordered_set cudnn_act_set({"relu", "tanh"}); #else const std::unordered_set cudnn_act_set({"relu"}); #endif diff --git a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc index 994fbdf2ce69f..89a023197a27e 100644 --- a/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/gpu/conv2d_add_fuse_pass.cc @@ -138,62 +138,6 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase { [this](const paddle::drr::MatchContext &match_ctx) -> std::string { return cutlass_pattern_ ? "gpu" : "gpudnn"; }); - const auto &perm_weight_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ || data_format == "NHWC") { - return {0, 2, 3, 1}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &perm_input_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return {0, 2, 3, 1}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &perm_bias_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector { - auto data_format = match_ctx.Attr("data_format"); - auto bias_shape = pir::GetShapeFromValue(match_ctx.Tensor("bias")); - if (cutlass_pattern_ && data_format == "NCHW") { - if (bias_shape.size() == 4) { - return {0, 2, 3, 1}; - } else if (bias_shape.size() == 3) { - return {0, 2, 1}; - } else { - return {0}; - } - } else { - std::vector dst_vector(bias_shape.size()); - std::iota(dst_vector.begin(), dst_vector.end(), 0); - return dst_vector; - } - }); - const auto &data_format_conv = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::string { - auto data_format = match_ctx.Attr("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return "NHWC"; - } else { - return data_format; - } - }); - // TODO(bukejiyu) When the transfer_layout_pass is supported, - // transpose_op will be deleted. 
- const auto &transpose_op_w = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_weight_shape}}); - const auto &transpose_op_input = res.Op( - paddle::dialect::TransposeOp::name(), {{"perm", perm_input_shape}}); - const auto &transpose_op_bias = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_bias_shape}}); - res.Tensor("filter_transpose") = transpose_op_w(res.Tensor("filter")); - res.Tensor("input_transpose") = transpose_op_input(res.Tensor("input")); - res.Tensor("bias_transpose") = transpose_op_bias(res.Tensor("bias")); const auto &fused_conv2d_add_act = res.Op( paddle::dialect::FusedConv2dAddActOp::name(), {{ @@ -202,7 +146,7 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase { {"padding_algorithm", pat.Attr("padding_algorithm")}, {"dilations", pat.Attr("dilations")}, {"groups", pat.Attr("groups")}, - {"data_format", data_format_conv}, + {"data_format", pat.Attr("data_format")}, {"activation", res.StrAttr("identity")}, {"split_channels", res.VectorInt32Attr({})}, {"exhaustive_search", res.BoolAttr(false)}, @@ -211,25 +155,11 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase { }}, {{{paddle::dialect::kForceBackendAttr, force_backend_runtime_attr}}}); - fused_conv2d_add_act( - {&res.Tensor("input_transpose"), - &res.Tensor("filter_transpose"), - &res.Tensor("bias_transpose"), - &res.InputNoneTensor()}, - {&res.Tensor("fuesd_conv2d_add_act_out"), &res.OutputNoneTensor()}); - const auto &perm_out_shape = res.ComputeAttr( - [this](const paddle::drr::MatchContext &match_ctx) -> std::vector<int> { - auto data_format = match_ctx.Attr<std::string>("data_format"); - if (cutlass_pattern_ && data_format == "NCHW") { - return {0, 3, 1, 2}; - } else { - return {0, 1, 2, 3}; - } - }); - const auto &transpose_op_out = res.Op(paddle::dialect::TransposeOp::name(), - {{"perm", perm_out_shape}}); - res.Tensor("add_out") = - transpose_op_out(res.Tensor("fuesd_conv2d_add_act_out")); + fused_conv2d_add_act({&res.Tensor("input"), + &res.Tensor("filter"), + &res.Tensor("bias"), + &res.InputNoneTensor()}, + {&res.Tensor("add_out"), &res.OutputNoneTensor()}); } }; diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 358d52d03d31b..d7b164862cd7e 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -68,8 +68,7 @@ PADDLE_DEFINE_EXPORTED_uint64(cuda_memory_async_pool_realease_threshold, "Amount of reserved memory in bytes to hold onto " "before trying to release memory back to the OS"); -namespace paddle { -namespace platform { +namespace paddle::platform { void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; @@ -719,5 +718,4 @@ void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { phi::backends::gpu::GpuMemsetAsync(dst, value, count, stream); } -} // namespace platform -} // namespace paddle +} // namespace paddle::platform diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 496b253dff5b3..980b7cb35410b 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -80,8 +80,14 @@ namespace dynload { __macro(cublasSgetriBatched); \ __macro(cublasDgetrfBatched); \ __macro(cublasDgetriBatched); \ + __macro(cublasCgetrfBatched); \ + __macro(cublasCgetriBatched); \ + __macro(cublasZgetrfBatched); \ + __macro(cublasZgetriBatched); \ __macro(cublasSmatinvBatched); \ __macro(cublasDmatinvBatched); \ +
__macro(cublasCmatinvBatched); \ + __macro(cublasZmatinvBatched); \ __macro(cublasSgetrsBatched); \ __macro(cublasDgetrsBatched); diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc index 7b0ea3bb7f3c1..ee270918b59c7 100644 --- a/paddle/fluid/platform/dynload/nccl.cc +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/nccl.h" -namespace paddle { -namespace platform { -namespace dynload { +namespace paddle::platform::dynload { #define DEFINE_WRAP(__name) DynLoad__##__name __name @@ -38,6 +36,4 @@ NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) #endif -} // namespace dynload -} // namespace platform -} // namespace paddle +} // namespace paddle::platform::dynload diff --git a/paddle/fluid/prim/api/api.yaml b/paddle/fluid/prim/api/api.yaml index a951ed4431a57..61e056678d19f 100644 --- a/paddle/fluid/prim/api/api.yaml +++ b/paddle/fluid/prim/api/api.yaml @@ -38,6 +38,7 @@ - pad - sqrt - cumsum +- cumprod - put_along_axis - sin - cos diff --git a/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 b/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 index 55b65bf05163f..b1b675a78589a 100644 --- a/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 +++ b/paddle/fluid/prim/api/auto_code_generated/template/static_prim_api.cc.j2 @@ -1,5 +1,5 @@ {% from "utils.cc.j2" import static_prim_api %} -// Generated by /paddle/fluid/prim/api/auto_code_generated/static_gen.py. +// Generated by /paddle/fluid/prim/api/auto_code_generated/static_gen.py. // DO NOT EDIT! #include diff --git a/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 b/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 index 78a270ef37d5b..5e34af02f2857 100644 --- a/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 +++ b/paddle/fluid/prim/api/auto_code_generated/template/utils.cc.j2 @@ -25,7 +25,7 @@ {% endfilter %} op->CheckAttrs(); op->InferVarType(block); - op->InferShape(*block); + op->InferShape(*block); {% if outputs|length > 1 %} return std::make_tuple{{sequence('(', ')', ', ', output_names)}}; {% elif outputs|length == 1 %} @@ -56,7 +56,7 @@ template <> {%- macro static_prim_api_sig_ret(outputs) -%} {%- set names = [] -%} {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type) -%} {%- endfor -%} - {%- if names|length > 1 -%} + {%- if names|length > 1 -%} std::tuple<{{sequence('', '', ', ', names)}}> {%- else -%} {{names[0]}} @@ -80,7 +80,7 @@ if ({{input.name}}) { std::transform({{input.name}}.get().begin(), {{input.name}}.get().end(), {{input.name}}_names.begin(), [](const Tensor& t) { return std::static_pointer_cast(t.impl())->Name(); }); - op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names); + op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names); } {%- else -%} if ({{input.name}}) { @@ -96,7 +96,7 @@ std::vector {{input.name}}_names({{input.name}}.size());; std::transform({{input.name}}.begin(), {{input.name}}.end(), {{input.name}}_names.begin(), [](const Tensor& t) { return std::static_pointer_cast(t.impl())->Name(); }); -op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names); +op->SetInput("{{input.fluid_name | to_pascal}}", {{input.name}}_names); {%- else -%} op->SetInput("{{input.fluid_name | to_pascal}}", {std::static_pointer_cast({{input.name}}.impl())->Name()}); {%- endif -%} @@ 
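The cublas.h hunk above extends the dynload macro list with the complex-typed batched LU factorization, LU-based inversion, and direct inversion entry points, matching the existing S/D variants. For reference, the single-precision complex prototypes as documented by cuBLAS (quoted for context, not taken from this diff; the Z-prefixed variants take cuDoubleComplex):

#include <cublas_v2.h>
#include <cuComplex.h>

cublasStatus_t cublasCgetrfBatched(cublasHandle_t handle, int n,
                                   cuComplex* const Aarray[], int lda,
                                   int* PivotArray, int* infoArray,
                                   int batchSize);
cublasStatus_t cublasCgetriBatched(cublasHandle_t handle, int n,
                                   const cuComplex* const Aarray[], int lda,
                                   const int* PivotArray,
                                   cuComplex* const Carray[], int ldc,
                                   int* infoArray, int batchSize);
cublasStatus_t cublasCmatinvBatched(cublasHandle_t handle, int n,
                                    const cuComplex* const A[], int lda,
                                    cuComplex* const Ainv[], int lda_inv,
                                    int* info, int batchSize);

Separately, the api.yaml hunk above registers cumprod as a prim API, which the prod_grad rewrite further below relies on.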
-180,7 +180,7 @@ paddle::framework::TransToProtoVarType({{src_name}}) {%- set is_set = [] -%} {#- why not use boolean, ref: https://stackoverflow.com/questions/17925674/jinja2-local-global-variable -#} {%- if not is_set -%} {#- use DataType attr as default output dtype -#} {%- for attr in attrs -%} - {%- if attr.typename is datatype -%} + {%- if attr.typename is datatype -%} {{attr.name}} {%- do is_set.append(1) -%} {%- endif -%} diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 0465f73a44593..17bc345917064 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -744,13 +744,20 @@ void slice_grad(const Tensor& input, paddings.push_back(offsets[i]); paddings.push_back((in_dims[i] - out_dims[i]) - offsets[i]); } + Tensor reshape_out_grad; + if (out_grad.shape().size() == 0) { + reshape_out_grad = full({1}, 1, input.dtype()); + } else { + reshape_out_grad = out_grad; + } + if (decrease_size > 0 && (decrease_size != static_cast(in_dims.size()))) { auto out_tmp = - pad(reshape(out_grad, origin_out_shape), paddings, 0.0); + pad(reshape(reshape_out_grad, origin_out_shape), paddings, 0.0); set_output(out_tmp, input_grad); } else { - auto out_tmp = pad(out_grad, paddings, 0.0); + auto out_tmp = pad(reshape_out_grad, paddings, 0.0); set_output(out_tmp, input_grad); } } @@ -1127,11 +1134,13 @@ void prod_grad(const Tensor& x, } else { reduce_all = false; } - auto x_grad_tmp = Tensor(); - auto out_tmp = Tensor(); + auto out_grad_tmp = Tensor(); + auto x_reshape = Tensor(); + std::vector unchange_axis, change_axis, transpose_shape, + cumprod_shape; + std::vector transpose_dim, origin_position; if (x_dim_size == 1) { - x_grad_tmp = out_grad.expand(IntArray(x_dim)); - out_tmp = out.expand(IntArray(x_dim)); + out_grad_tmp = out_grad.expand(IntArray(x_dim)); } else { if (!keep_dim) { auto axis_ = std::vector(); @@ -1149,16 +1158,69 @@ void prod_grad(const Tensor& x, } auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_); auto out_grad_ = reshape(out_grad, out_grad_shape); - x_grad_tmp = out_grad_.expand(IntArray(x_dim)); - auto out_ = reshape(out, out_grad_shape); - out_tmp = out_.expand(IntArray(x_dim)); + out_grad_tmp = out_grad_.expand(IntArray(x_dim)); } else { - x_grad_tmp = out_grad.expand(IntArray(x_dim)); - out_tmp = out.expand(IntArray(x_dim)); + out_grad_tmp = out_grad.expand(IntArray(x_dim)); } } - auto x_grad_res = x_grad_tmp * out_tmp * (1 / x); - set_output(x_grad_res, x_grad); + auto axis_ = std::vector(); + if (reduce_all) { + int64_t numel = 1; + for (int64_t i = 0; i < x_dim_size; i++) { + axis_.push_back(i); + numel *= x_dim[i]; + } + cumprod_shape.push_back(numel); + x_reshape = reshape(x, cumprod_shape); + auto left_cumprod = cumprod(x_reshape, -1, true, false); + auto right_cumprod = cumprod(x_reshape, -1, true, true); + auto x_grad_tmp = left_cumprod * right_cumprod; + auto x_grad_tmp2 = reshape(x_grad_tmp, x.shape()); + auto x_grad_res = x_grad_tmp2 * out_grad_tmp; + set_output(x_grad_res, x_grad); + } else { + int64_t unchange_size = x_dim_size - axis_size; + int64_t unchange_index = 0; + for (int64_t i = 0; i < axis_size; i++) { + if (axis[i] < 0) { + axis_.push_back(axis[i] + x_dim_size); + } else { + axis_.push_back(axis[i]); + } + } + for (int64_t i = 0; i < x_dim_size; i++) { + auto it = find(axis_.begin(), axis_.end(), i); + if (it != axis_.end()) { + int64_t index = it - 
axis_.begin(); + origin_position.push_back(static_cast(unchange_size + index)); + } else { + unchange_axis.push_back(i); + origin_position.push_back(static_cast(unchange_index)); + unchange_index += 1; + } + } + int64_t numel = 1; + for (int64_t i = 0; i < unchange_size; i++) { + transpose_shape.push_back(x_dim[unchange_axis[i]]); + cumprod_shape.push_back(x_dim[unchange_axis[i]]); + transpose_dim.push_back(static_cast(unchange_axis[i])); + } + for (int64_t i = 0; i < axis_size; i++) { + transpose_shape.push_back(x_dim[axis_[i]]); + transpose_dim.push_back(static_cast(axis_[i])); + numel *= x_dim[axis_[i]]; + } + cumprod_shape.push_back(numel); + auto x_transpose = transpose(x, transpose_dim); + x_reshape = reshape(x_transpose, cumprod_shape); + auto left_cumprod = cumprod(x_reshape, -1, true, false); + auto right_cumprod = cumprod(x_reshape, -1, true, true); + auto x_grad_tmp = left_cumprod * right_cumprod; + auto x_grad_reshape = reshape(x_grad_tmp, transpose_shape); + auto x_grad_tmp2 = transpose(x_grad_reshape, origin_position); + auto x_grad_res = x_grad_tmp2 * out_grad_tmp; + set_output(x_grad_res, x_grad); + } } } diff --git a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc index 2f76e8bbd966f..43ab21ccd3e06 100644 --- a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc +++ b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc @@ -23,8 +23,7 @@ #include "paddle/fluid/prim/utils/static/static_global_utils.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/utils/data_type.h" -namespace paddle { -namespace prim { +namespace paddle::prim { using Tensor = paddle::Tensor; template <> TEST_API Tensor empty(const paddle::experimental::IntArray& shape, @@ -69,5 +68,4 @@ void by_pass(const paddle::Tensor& x, paddle::Tensor* real_out) { set_output(out, real_out); } -} // namespace prim -} // namespace paddle +} // namespace paddle::prim diff --git a/paddle/fluid/prim/utils/static/static_global_utils.cc b/paddle/fluid/prim/utils/static/static_global_utils.cc index 3d1aa2158048d..71179429dc997 100644 --- a/paddle/fluid/prim/utils/static/static_global_utils.cc +++ b/paddle/fluid/prim/utils/static/static_global_utils.cc @@ -14,12 +14,10 @@ #include "paddle/fluid/prim/utils/static/static_global_utils.h" -namespace paddle { -namespace prim { +namespace paddle::prim { StaticCompositeContext* StaticCompositeContext::static_composite_context_ = new StaticCompositeContext(); thread_local bool StaticCompositeContext::enable_bwd_prim_ = false; thread_local bool StaticCompositeContext::enable_fwd_prim_ = false; thread_local bool StaticCompositeContext::enable_eager_prim_ = false; -} // namespace prim -} // namespace paddle +} // namespace paddle::prim diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 index 7f9f4b5b8676f..b8910ff5b9d9a 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 @@ -12,7 +12,7 @@ namespace backend { {%- macro args(inputs, attrs) -%} {#- Arguments are variable pass into method -#} {{common.sequence('', '', ', ', inputs)}} - {%- if attrs|length > 0 -%} {{", "}} {%- endif -%} {#- append comma between + {%- if attrs|length > 0 -%} {{", "}} {%- endif -%} {#- append comma between nputs and attrs -#} 
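The prod_grad rewrite above (and its twin in paddle/fluid/primitive/rule/vjp/details.h later in this diff) replaces the old x_grad = out_grad * out * (1/x) formula, which emits inf/nan as soon as any input element is zero, with a pair of exclusive cumulative products: d prod(x)/d x_i = (prod_{j<i} x_j) * (prod_{j>i} x_j). A standalone 1-D sketch of that identity, outside the Paddle tensor API:

#include <cstdio>
#include <vector>

int main() {
  // left[i] / right[i] are exclusive products of everything before / after
  // x[i] -- the 1-D analogue of the forward and reversed exclusive cumprod
  // calls assembled above.
  std::vector<double> x = {2.0, 0.0, 3.0, 4.0};  // note the zero
  const int n = static_cast<int>(x.size());
  std::vector<double> left(n, 1.0), right(n, 1.0);
  for (int i = 1; i < n; ++i) left[i] = left[i - 1] * x[i - 1];
  for (int i = n - 2; i >= 0; --i) right[i] = right[i + 1] * x[i + 1];
  const double out_grad = 1.0;  // upstream gradient of the scalar prod(x)
  for (int i = 0; i < n; ++i)   // old formula would divide by x[1] == 0 here
    printf("dprod/dx[%d] = %g\n", i, left[i] * right[i] * out_grad);
}

The non-reduce_all branch reduces to this same 1-D rule by transposing the reduced axes to the end, flattening them into a single dimension, and undoing the transpose afterwards via origin_position.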
{{common.sequence('', '', ', ', attrs)}} {%- endmacro -%} @@ -37,7 +37,7 @@ return ::{{name}}_ad_func({{common.args(input_names, attr_names)}}); {% for api in apis %} {%- if api.is_prim and api.name not in backend_black_list and api.name[-1] != '_' -%} {{sig(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} { -{{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} +{{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} } {% endif %} diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 index 26f81d756f0b5..8e4921acbb013 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 @@ -117,20 +117,20 @@ pir::Value {{attr.name}}_res = std::static_pointer_cast({{attr.name~ {% endif %} {% endfor %} {%- set input_names = [] -%} - {%- for i in inputs -%} - {%- do input_names.append(i.name~'_res') -%} + {%- for i in inputs -%} + {%- do input_names.append(i.name~'_res') -%} {%- endfor -%} {%- if mutable_attribute_as_inputs -%} - {%- for i in attrs -%} + {%- for i in attrs -%} {%- if i is mutable_attribute -%} - {%- do input_names.append(i.name~'_res') -%} + {%- do input_names.append(i.name~'_res') -%} {%- endif -%} {%- endfor -%} {%- endif -%} {%- set attr_names = [] -%} - {%- for i in attrs -%} + {%- for i in attrs -%} {%- if not mutable_attribute_as_inputs or mutable_attribute_as_inputs and i is not mutable_attribute -%}{#- do nothing -#} - {%- do attr_names.append(common.phi2ir_attr(i)) -%} + {%- do attr_names.append(common.phi2ir_attr(i)) -%} {%- endif -%} {% endfor %} auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}}); @@ -145,14 +145,14 @@ auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}} {% set api_outputs = api.outputs | trip_intermediate %} {{sig(api.name, api.inputs, api_outputs, api.attrs)}} { {% filter indent(2, True) %} -{{body(api.name, api.inputs, api_outputs, api.attrs)}} +{{body(api.name, api.inputs, api_outputs, api.attrs)}} {% endfilter %} } {% if api.attrs is exist_mutable_attribute %} {{sig(api.name, api.inputs, api_outputs, api.attrs, True)}} { {% filter indent(2, True) %} -{{body(api.name, api.inputs, api_outputs, api.attrs, True)}} +{{body(api.name, api.inputs, api_outputs, api.attrs, True)}} {% endfilter %} } diff --git a/paddle/fluid/primitive/codegen/templates/common.j2 b/paddle/fluid/primitive/codegen/templates/common.j2 index b29401133db03..ecf5e54cae33b 100644 --- a/paddle/fluid/primitive/codegen/templates/common.j2 +++ b/paddle/fluid/primitive/codegen/templates/common.j2 @@ -8,12 +8,12 @@ template {%- set input_params = [] -%} {%- for i in inputs -%} {%- do input_params.append(i.typename|to_paddle_input_type(i.optional)~' '~i.name) -%} {%- endfor -%} {%- set attr_params = [] -%} - {%- for i in attrs -%} + {%- for i in attrs -%} {%- if not mutable_attribute_as_inputs or i is not mutable_attribute -%} {%- if default -%} - {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name~default_value(i)) -%} + {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name~default_value(i)) -%} {%- else -%} - {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name) -%} + {%- do attr_params.append(i.typename|to_paddle_attr_type~' '~i.name) -%} {%- endif -%} {%- 
else -%} {%- do input_params.append('const Tensor&'~' '~i.name~'_') -%} @@ -43,7 +43,7 @@ template {%- macro ret(outputs) -%} {%- set names = [] -%} {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type(i.optional)) -%} {%- endfor -%} - {%- if names|length > 1 -%} + {%- if names|length > 1 -%} std::tuple<{{sequence('', '', ', ', names)}}> {%- else -%} {{names[0]}} diff --git a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 index 460b8e3a2fcdc..592b45b84aa72 100644 --- a/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 +++ b/paddle/fluid/primitive/codegen/templates/decomp/generated_decomp_vjp.j2 @@ -139,13 +139,13 @@ std::vector> {{class_name}}::DecompVjp(pir::Operation* o auto stop_gradients_attr = op->attribute(kAttrStopGradients) .dyn_cast() .AsVector(); - {% for k in range(outputs|length) %} + {% for k in range(outputs|length) %} stop_gradients[{{k}}].push_back( stop_gradients_attr[{{k}}].dyn_cast().data()); - {% endfor %} + {% endfor %} VLOG(4) << " stop_gradients is set "; } else { - {% for k in range(outputs|length) %} + {% for k in range(outputs|length) %} stop_gradients[{{k}}].push_back(false); {% endfor %} VLOG(4) << " stop_gradients is not set "; @@ -160,7 +160,7 @@ std::vector> {{class_name}}::DecompVjp(pir::Operation* o VLOG(4) << "Call Pir Decomposed backward op {{fwd_name}}"; - {% for k in range(outputs|length) %} + {% for k in range(outputs|length) %} paddle::Tensor* {{outputs[k].name}} = !stop_gradients[{{k}}][0] ? &tensor_res[{{k}}][0] : nullptr; {% endfor %} diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 index 105175758f22d..31ec42aacd7a9 100644 --- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 @@ -79,11 +79,11 @@ auto {{i.name}} = phi::IntArray(paddle::dialect::GetInt64Vector({{i.name}}_defin {%- for api in apis -%} {%- do api_map.update({api.name: api}) -%} {%- endfor -%} {%- for i in api.inputs -%} {%- do input_names.append(i.name) -%} {%- endfor -%} {%- set attr_names=[] -%} - {%- for i in api.attrs -%} + {%- for i in api.attrs -%} {%- if i is mutable_attribute -%} - {%- do input_names.append(i.name~'_') -%} + {%- do input_names.append(i.name~'_') -%} {%- else -%} - {%- do attr_names.append(i.name) -%} + {%- do attr_names.append(i.name) -%} {%- endif -%} {%- endfor %} {% if 'invoke' in api and api.invoke.func in api_map %} @@ -116,7 +116,7 @@ FLAGS_tensor_operants_mode = "static"; VLOG(4) << "Call Pir Decomposed backward op {{api.name}}"; {% for i in range(api.outputs|length) %} {% if api.outputs[i].typename=='Tensor' %} -paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? &vjp_res[{{i}}][0] : nullptr; +paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? 
&vjp_res[{{i}}][0] : nullptr; {% else %} std::vector {{api.outputs[i].name}}(stop_gradients[{{i}}].size(), nullptr); for (size_t i=0; i< stop_gradients[{{i}}].size(); i++ ) { diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index 091d540aa461a..7b08f9f6571fd 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -23,9 +23,6 @@ namespace paddle { namespace primitive { namespace details { -// empty_shape means x.shape=[] -static std::vector empty_shape; - template static Tensor get_slice(const Tensor& x, int64_t idx) { return slice(x, {0}, {idx}, {idx + 1}, {1}, {}); @@ -98,7 +95,7 @@ Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { for (size_t i = 0; i < axis_.size(); i++) { value_ *= x_dim[axis_[i]]; } - value = full(empty_shape, value_, sum_x.dtype()); + value = full_scalar(value_, sum_x.dtype()); } Tensor res = sum_x / value; @@ -148,7 +145,7 @@ Tensor p_norm_decomp(const Tensor& x, Tensor res; if (porder == 0.0) { // 0-norm - auto zero = full(empty_shape, 0, x_tmp.dtype()); + auto zero = full_scalar(0, x_tmp.dtype()); auto none_zero = not_equal(x_tmp, zero); res = cast(none_zero, x_tmp.dtype()); res = sum(res, {axis}, x_tmp.dtype(), keepdim); @@ -169,8 +166,8 @@ Tensor p_norm_decomp(const Tensor& x, res = min(x_tmp, {axis}, keepdim); } else { // vanilla p-norm - auto porder_tensor = full(empty_shape, porder, x_tmp.dtype()); - auto inv_porder_tensor = full(empty_shape, 1 / porder, x_tmp.dtype()); + auto porder_tensor = full_scalar(porder, x_tmp.dtype()); + auto inv_porder_tensor = full_scalar(1 / porder, x_tmp.dtype()); res = elementwise_pow(x_tmp, porder_tensor); res = sum(res, {axis}, x_tmp.dtype(), keepdim); res = elementwise_pow(res, inv_porder_tensor); @@ -194,8 +191,7 @@ Tensor pow_decomp(const Tensor& x, const paddle::Scalar& y) { } check_valid_type(y.dtype()); - Tensor y_full = full(empty_shape, y, x_cast.dtype()); - + Tensor y_full = full_scalar(y, x_cast.dtype()); auto ans = elementwise_pow(x_cast, y_full); if (need_cast) { return cast(ans, org_dtype); @@ -282,13 +278,13 @@ Tensor squared_l2_norm_decomp(const Tensor& x) { template Tensor reciprocal_decomp(const Tensor& x) { - return full(empty_shape, 1.0, x.dtype()) / x; + return full_scalar(1.0, x.dtype()) / x; } template Tensor bce_loss_decomp(const Tensor& x, const Tensor& label) { - auto one = full(empty_shape, 1, x.dtype()); - auto ans = full(empty_shape, -1, x.dtype()) * + auto one = full_scalar(1, x.dtype()); + auto ans = full_scalar(-1, x.dtype()) * (label * log(x) + (one - label) * log(one - x)); return ans; } @@ -382,7 +378,7 @@ std::tuple batch_norm_decomp( } } - Tensor half = full(empty_shape, -0.5, x_cast.dtype()); + Tensor half = full_scalar(-0.5, x_cast.dtype()); bool use_run_stat = (is_test && (!trainable_statistics)) || use_global_stats; Tensor x_hat; @@ -421,9 +417,8 @@ std::tuple batch_norm_decomp( run_var_ = assign(run_var); } Tensor y; - Tensor new_scale = - scale ? scale.get() : full(empty_shape, 1, x_cast.dtype()); - Tensor new_bias = bias ? bias.get() : full(empty_shape, 0, x_cast.dtype()); + Tensor new_scale = scale ? scale.get() : full_scalar(1, x_cast.dtype()); + Tensor new_bias = bias ? 
bias.get() : full_scalar(0, x_cast.dtype()); if (data_layout_ == DataLayout::kNHWC) { y = x_hat * new_scale + new_bias; } else { @@ -539,13 +534,13 @@ Tensor swiglu_decomp(const Tensor& x, const paddle::optional& y) { template Tensor relu_decomp(const Tensor& x) { - return maximum(x, full(empty_shape, 0.0, x.dtype())); + return maximum(x, full_scalar(0.0, x.dtype())); } template Tensor relu6_decomp(const Tensor& x) { - auto tmp = maximum(x, full(empty_shape, 0.0, x.dtype())); - auto res = minimum(tmp, full(empty_shape, 6.0, x.dtype())); + auto tmp = maximum(x, full_scalar(0.0, x.dtype())); + auto res = minimum(tmp, full_scalar(6.0, x.dtype())); return res; } @@ -653,7 +648,7 @@ std::tuple layer_norm_decomp( auto difference = x_cast - mean_; auto var_tmp1 = difference * difference; auto variance = mean_decomp(var_tmp1, axis, true); - auto var_tmp3 = variance + full(empty_shape, epsilon, variance.dtype()); + auto var_tmp3 = variance + full_scalar(epsilon, variance.dtype()); auto rsqrt_var = rsqrt(var_tmp3); auto out = difference * rsqrt_var; @@ -798,18 +793,18 @@ std::tuple dropout_decomp( Tensor uniform_tensor; if (has_dynamic_shape(x.shape())) { auto shape_tensor = shape(x); - auto zero = full(empty_shape, 0.0, dtype_tmp); - auto one = full(empty_shape, 1.0, dtype_tmp); + auto zero = full_scalar(0.0, dtype_tmp); + auto one = full_scalar(1.0, dtype_tmp); uniform_tensor = backend::uniform(shape_tensor, zero, one, dtype_tmp, seed_tmp); } else { uniform_tensor = uniform(phi::vectorize(x.dims()), dtype_tmp, 0.0, 1.0, seed_tmp); } - auto mask = cast( - greater_equal(uniform_tensor, full(empty_shape, p, dtype_tmp)), - org_dtype); - auto ones_p = full(empty_shape, 1.0 - p.to(), org_dtype); + auto mask = + cast(greater_equal(uniform_tensor, full_scalar(p, dtype_tmp)), + org_dtype); + auto ones_p = full_scalar(1.0 - p.to(), org_dtype); if (upscale_in_train) { if (is_test) { // inference: out = input @@ -818,7 +813,7 @@ std::tuple dropout_decomp( // train: out = input * mask / ( 1.0 - p ) if (p.to() == 1.0) { // Process p=1. 
for avoid divide zero error (x*mask/(1.0-p)) - auto zero = full(empty_shape, 0.0, org_dtype); + auto zero = full_scalar(0.0, org_dtype); return std::make_tuple(x * zero, cast(zero, DataType::UINT8)); } else { auto ans = (x * mask) / ones_p; @@ -842,20 +837,20 @@ Tensor gelu_decomp(const Tensor& x, bool approximate) { const double PM_SQRT1_2 = 0.70710678118654752440; /* 1/sqrt(2) */ auto org_dtype = x.dtype(); - auto half = full(empty_shape, 0.5, org_dtype); - auto one = full(empty_shape, 1.0, org_dtype); + auto half = full_scalar(0.5, org_dtype); + auto one = full_scalar(1.0, org_dtype); if (approximate) { // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) - auto kAlpha = full(empty_shape, PM_2_SQRTPI * PM_SQRT1_2, org_dtype); - auto GELU_CONSTANT = full(empty_shape, 0.044715, org_dtype); - auto x_pow3 = elementwise_pow(x, full(empty_shape, 3, org_dtype)); + auto kAlpha = full_scalar(PM_2_SQRTPI * PM_SQRT1_2, org_dtype); + auto GELU_CONSTANT = full_scalar(0.044715, org_dtype); + auto x_pow3 = elementwise_pow(x, full_scalar(3, org_dtype)); auto tanh_out = tanh(kAlpha * (x + x_pow3 * GELU_CONSTANT)); auto res = x * half * (one + tanh_out); return res; } else { // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) - auto M_SQRT1_2T = full(empty_shape, PM_SQRT1_2, org_dtype); + auto M_SQRT1_2T = full_scalar(PM_SQRT1_2, org_dtype); auto erf_out = one + erf(x * M_SQRT1_2T); auto res = x * half * erf_out; @@ -867,10 +862,10 @@ template Tensor hardsigmoid_decomp(const Tensor& x, float slope, float offset) { const double MAX_VALUE = 1.0; const double MIN_VALUE = 0.0; - return maximum(minimum(x * full(empty_shape, slope, x.dtype()) + - full(empty_shape, offset, x.dtype()), - full(empty_shape, MAX_VALUE, x.dtype())), - full(empty_shape, MIN_VALUE, x.dtype())); + return maximum(minimum(x * full_scalar(slope, x.dtype()) + + full_scalar(offset, x.dtype()), + full_scalar(MAX_VALUE, x.dtype())), + full_scalar(MIN_VALUE, x.dtype())); } template @@ -881,15 +876,15 @@ Tensor hardswish_decomp(const Tensor& x) { // out = minimum(maximum(x + offset, 0), threshold) * x / scale auto minimum_out = - minimum(maximum(x + full(empty_shape, OFFSET, x.dtype()), - full(empty_shape, 0.0, x.dtype())), - full(empty_shape, THRESHOLD, x.dtype())); - return (minimum_out * x) / full(empty_shape, SCALE, x.dtype()); + minimum(maximum(x + full_scalar(OFFSET, x.dtype()), + full_scalar(0.0, x.dtype())), + full_scalar(THRESHOLD, x.dtype())); + return (minimum_out * x) / full_scalar(SCALE, x.dtype()); } template Tensor leaky_relu_decomp(const Tensor& x, float negative_slope) { - auto multiply_tmp = full(empty_shape, negative_slope, x.dtype()) * x; + auto multiply_tmp = full_scalar(negative_slope, x.dtype()) * x; if (negative_slope < 1.0) { return maximum(x, multiply_tmp); } else { @@ -1127,8 +1122,7 @@ std::tuple group_norm_decomp( var_ = maximum( var_tmp_, backend::full_with_tensor(shape(var_tmp_), 0, var_tmp_.dtype())); - Tensor var_inv = - rsqrt(var_ + full(empty_shape, epsilon, var_.dtype())); + Tensor var_inv = rsqrt(var_ + full_scalar(epsilon, var_.dtype())); Tensor res = (x_cast - mean_) * var_inv; out = backend::reshape(res, x_dim_t); } else { @@ -1143,7 +1137,7 @@ std::tuple group_norm_decomp( auto var_tmp_ = mean_decomp(x_cast * x_cast, c_axis, true) - mean_ * mean_; var_ = maximum(var_tmp_, full(var_tmp_.shape(), 0, var_tmp_.dtype())); - auto var_inv = rsqrt(var_ + full(empty_shape, epsilon, var_.dtype())); + auto var_inv = rsqrt(var_ + full_scalar(epsilon, var_.dtype())); auto res = (x_cast - mean_) * 
var_inv; out = reshape(res, x_dim); } @@ -1207,7 +1201,7 @@ Tensor square_decomp(const Tensor& x) { } Tensor two; - two = full(empty_shape, 2, x_cast.dtype()); + two = full_scalar(2, x_cast.dtype()); auto ans = elementwise_pow(x_cast, two); if (need_cast) { @@ -1224,9 +1218,8 @@ Tensor sigmoid_cross_entropy_with_logits_decomp( const paddle::optional& pos_weight, bool normalize, int ignore_index) { - auto dims = x.shape(); - const Tensor zero = full(dims, 0, x.type()); - const Tensor one = full(dims, 1, x.type()); + const Tensor zero = full_like_decomp(x, 0, x.type(), x.place()); + const Tensor one = full_like_decomp(x, 1, x.type(), x.place()); Tensor pos_weight_tensor; if (pos_weight) { pos_weight_tensor = pos_weight.get(); @@ -1235,19 +1228,20 @@ Tensor sigmoid_cross_entropy_with_logits_decomp( } auto term1 = where(x > zero, x, zero); auto term2 = x * label; - auto term3 = log(1 + exp(-abs(x))); + auto term3 = log(one + exp(-abs(x))); const Tensor tmp_out = term1 - term2 + term3 * pos_weight_tensor; - const Tensor ignore_index_tensor = full(dims, ignore_index, label.type()); + const Tensor ignore_index_tensor = + full_like_decomp(x, ignore_index, label.type(), label.place()); auto out = where(label == ignore_index_tensor, zero, tmp_out); if (normalize) { // Follow the implementation in // paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc - const Tensor eps1 = full(dims, 1e-6, x.type()); + const Tensor eps1 = full_like_decomp(x, 1e-6, x.type(), x.place()); auto diff = label - ignore_index_tensor; const Tensor tmp_norm = sum(where(abs(diff) > eps1, one, zero)); // Follow the implementation in // paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc - const Tensor eps2 = full(empty_shape, 1e-5, x.type()); + const Tensor eps2 = full_scalar(1e-5, x.type()); auto norm = where(tmp_norm > eps2, tmp_norm, eps2); out = out / norm; } @@ -1387,8 +1381,8 @@ Tensor elu_decomp(const Tensor& x, const float alpha) { if (has_dynamic_shape(x_cast.shape())) { zero = backend::full_with_tensor(shape(x_cast), 0, x_cast.dtype()); - tmp_res = full(empty_shape, alpha, x_cast.dtype()) * - (exp(x_cast) - full(empty_shape, 1, x_cast.dtype())); + tmp_res = full_scalar(alpha, x_cast.dtype()) * + (exp(x_cast) - full_scalar(1, x_cast.dtype())); } else { zero = full(x_cast.shape(), 0, x_cast.type()); tmp_res = alpha * (exp(x_cast) - 1); diff --git a/paddle/fluid/primitive/manual/manual_primitive.h b/paddle/fluid/primitive/manual/manual_primitive.h index f2ec3ebce45b3..6587adf862a6e 100644 --- a/paddle/fluid/primitive/manual/manual_primitive.h +++ b/paddle/fluid/primitive/manual/manual_primitive.h @@ -30,6 +30,15 @@ Tensor full(const IntArray& shape, return backend::full(shape, value, dtype, place); } +template +Tensor full_scalar(const Scalar& value, + DataType dtype = DataType::FLOAT32, + Place place = Place()) { + // empty_shape means x.shape=[] + std::vector empty_shape; + return backend::full(empty_shape, value, dtype, place); +} + template Tensor assign_out_(const Tensor& x, const Tensor& output) { return backend::assign_out_(x, output); diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index 00e464859e29e..42d3ce7d97dfd 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -1483,13 +1483,20 @@ void slice_grad(const Tensor& input, paddings.push_back(offsets[i]); paddings.push_back((in_dims[i] - out_dims[i]) - offsets[i]); } + Tensor reshape_out_grad; + if (out_grad.shape().size() == 0) { 
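full_scalar, added above in manual_primitive.h, is what the composite.h hunks swap in for the old full(empty_shape, ...) calls: a scalar constant becomes a 0-D tensor (shape []) that broadcasts against an operand of any rank, so each decomposition no longer carries its own empty_shape vector. A sketch of the intended usage in terms of the diff's own primitives (comments only, since Tensor is Paddle-internal and not standalone-compilable):

// Tensor one  = full_scalar<T>(1.0, x.dtype());  // 0-D constant, shape []
// Tensor half = full_scalar<T>(0.5, x.dtype());
// auto y = x * half + one;  // 0-D operands broadcast over x's shape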
+ reshape_out_grad = full({1}, 1, input.dtype()); + } else { + reshape_out_grad = out_grad; + } + if (decrease_size > 0 && (decrease_size != static_cast(in_dims.size()))) { auto out_tmp = - pad(reshape(out_grad, origin_out_shape), paddings, 0.0); + pad(reshape(reshape_out_grad, origin_out_shape), paddings, 0.0); set_output(out_tmp, input_grad); } else { - auto out_tmp = pad(out_grad, paddings, 0.0); + auto out_tmp = pad(reshape_out_grad, paddings, 0.0); set_output(out_tmp, input_grad); } } @@ -1548,7 +1555,8 @@ void leaky_relu_grad(const Tensor& out, template void sigmoid_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - set_output(out_grad * (out * (1 - out)), x_grad); + auto one_tensor = full_scalar(1.0, out.dtype()); + set_output(out_grad * (out * (one_tensor - out)), x_grad); } } @@ -1772,11 +1780,13 @@ void prod_grad(const Tensor& x, } else { reduce_all = false; } - auto x_grad_tmp = Tensor(); - auto out_tmp = Tensor(); + auto out_grad_tmp = Tensor(); + auto x_reshape = Tensor(); + std::vector unchange_axis, change_axis, transpose_shape, + cumprod_shape; + std::vector transpose_dim, origin_position; if (x_dim_size == 1) { - x_grad_tmp = out_grad.expand(IntArray(x_dim)); - out_tmp = out.expand(IntArray(x_dim)); + out_grad_tmp = out_grad.expand(IntArray(x_dim)); } else { if (!keep_dim) { auto axis_ = std::vector(); @@ -1794,16 +1804,69 @@ void prod_grad(const Tensor& x, } auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_); auto out_grad_ = reshape(out_grad, out_grad_shape); - x_grad_tmp = out_grad_.expand(IntArray(x_dim)); - auto out_ = reshape(out, out_grad_shape); - out_tmp = out_.expand(IntArray(x_dim)); + out_grad_tmp = out_grad_.expand(IntArray(x_dim)); } else { - x_grad_tmp = out_grad.expand(IntArray(x_dim)); - out_tmp = out.expand(IntArray(x_dim)); + out_grad_tmp = out_grad.expand(IntArray(x_dim)); } } - auto x_grad_res = x_grad_tmp * out_tmp * (1 / x); - set_output(x_grad_res, x_grad); + auto axis_ = std::vector(); + if (reduce_all) { + int64_t numel = 1; + for (int64_t i = 0; i < x_dim_size; i++) { + axis_.push_back(i); + numel *= x_dim[i]; + } + cumprod_shape.push_back(numel); + x_reshape = reshape(x, cumprod_shape); + auto left_cumprod = cumprod(x_reshape, -1, true, false); + auto right_cumprod = cumprod(x_reshape, -1, true, true); + auto x_grad_tmp = left_cumprod * right_cumprod; + auto x_grad_tmp2 = reshape(x_grad_tmp, x.shape()); + auto x_grad_res = x_grad_tmp2 * out_grad_tmp; + set_output(x_grad_res, x_grad); + } else { + int64_t unchange_size = x_dim_size - axis_size; + int64_t unchange_index = 0; + for (int64_t i = 0; i < axis_size; i++) { + if (axis[i] < 0) { + axis_.push_back(axis[i] + x_dim_size); + } else { + axis_.push_back(axis[i]); + } + } + for (int64_t i = 0; i < x_dim_size; i++) { + auto it = find(axis_.begin(), axis_.end(), i); + if (it != axis_.end()) { + int64_t index = it - axis_.begin(); + origin_position.push_back(static_cast(unchange_size + index)); + } else { + unchange_axis.push_back(i); + origin_position.push_back(static_cast(unchange_index)); + unchange_index += 1; + } + } + int64_t numel = 1; + for (int64_t i = 0; i < unchange_size; i++) { + transpose_shape.push_back(x_dim[unchange_axis[i]]); + cumprod_shape.push_back(x_dim[unchange_axis[i]]); + transpose_dim.push_back(static_cast(unchange_axis[i])); + } + for (int64_t i = 0; i < axis_size; i++) { + transpose_shape.push_back(x_dim[axis_[i]]); + transpose_dim.push_back(static_cast(axis_[i])); + numel *= x_dim[axis_[i]]; + } + cumprod_shape.push_back(numel); + 
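Both slice_grad bodies in this diff implement the same rule: the gradient of a slice is out_grad scattered back into zeros of the input's shape, which is exactly what the pad call with the offset-derived paddings expresses; the new reshape_out_grad branch only substitutes a rank-1, one-element tensor when out_grad is 0-D, so the reshape/pad calls receive a non-empty shape. A standalone 1-D sketch of the scatter (illustrative values):

#include <cstdio>
#include <vector>

int main() {
  // Gradient of y = input[2:4] for a length-6 input: place out_grad at the
  // slice offset and zero-fill the rest -- pad(out_grad, paddings, 0.0).
  std::vector<double> out_grad = {10.0, 20.0};
  const int in_len = 6, offset = 2;
  std::vector<double> input_grad(in_len, 0.0);
  for (size_t i = 0; i < out_grad.size(); ++i)
    input_grad[offset + i] = out_grad[i];
  for (double g : input_grad) printf("%g ", g);  // 0 0 10 20 0 0
  printf("\n");
}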
auto x_transpose = transpose(x, transpose_dim); + x_reshape = reshape(x_transpose, cumprod_shape); + auto left_cumprod = cumprod(x_reshape, -1, true, false); + auto right_cumprod = cumprod(x_reshape, -1, true, true); + auto x_grad_tmp = left_cumprod * right_cumprod; + auto x_grad_reshape = reshape(x_grad_tmp, transpose_shape); + auto x_grad_tmp2 = transpose(x_grad_reshape, origin_position); + auto x_grad_res = x_grad_tmp2 * out_grad_tmp; + set_output(x_grad_res, x_grad); + } } } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 6deffc89271f9..aec35c6f6896a 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -14,7 +14,6 @@ set(PYBIND_DEPS pass generate_pass pass_builder - parallel_executor compiled_program profiler layer @@ -130,7 +129,6 @@ set(PYBIND_SRCS protobuf.cc exception.cc op_function_common.cc - parallel_executor.cc compiled_program.cc tensor.cc place.cc diff --git a/paddle/fluid/pybind/graph.cc b/paddle/fluid/pybind/graph.cc index 6acba237ba928..4e5329bbf2bfc 100644 --- a/paddle/fluid/pybind/graph.cc +++ b/paddle/fluid/pybind/graph.cc @@ -47,8 +47,7 @@ using paddle::framework::ir::NodeComp; using paddle::framework::ir::TopologySortOperations; using pybind11::return_value_policy; -namespace paddle { -namespace pybind { +namespace paddle::pybind { void BindGraph(py::module *m) { m->def("graph_safe_remove_nodes", [](Graph *graph, const std::unordered_set &nodes) { @@ -408,5 +407,4 @@ void BindPass(py::module *m) { }); } -} // namespace pybind -} // namespace paddle +} // namespace paddle::pybind diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h index 872be599d9a76..f41950db85e6d 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -81,7 +81,7 @@ static PyObject *static_api_set_parameter(PyObject *self, } } -static PyObject *static_api_updata_parameter(PyObject *self, +static PyObject *static_api_update_parameter(PyObject *self, PyObject *args, PyObject *kwargs) { try { @@ -98,7 +98,7 @@ static PyObject *static_api_updata_parameter(PyObject *self, // Call ir static api CallStackRecorder callstack_recoder("uodata_parameter"); callstack_recoder.Record(); - paddle::dialect::updata_parameter(parameter, name); + paddle::dialect::update_parameter(parameter, name); callstack_recoder.AttachToOps(); Py_RETURN_NONE; } catch (...) { @@ -975,10 +975,10 @@ static PyMethodDef ManualOpsAPI[] = { (PyCFunction)(void (*)(void))static_api_set_parameter, METH_VARARGS | METH_KEYWORDS, "C++ interface function for set_parameter."}, - {"updata_parameter", - (PyCFunction)(void (*)(void))static_api_updata_parameter, + {"update_parameter", + (PyCFunction)(void (*)(void))static_api_update_parameter, METH_VARARGS | METH_KEYWORDS, - "C++ interface function for updata_parameter."}, + "C++ interface function for update_parameter."}, {"set_persistable_value", (PyCFunction)(void (*)(void))static_api_set_persistable_value, METH_VARARGS | METH_KEYWORDS, diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc deleted file mode 100644 index 7f6b054564bc6..0000000000000 --- a/paddle/fluid/pybind/parallel_executor.cc +++ /dev/null @@ -1,1178 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -// Avoid a problem with copysign defined in pyconfig.h on Windows. -#ifdef copysign -#undef copysign -#endif - -#include -#include -#include -#include -#include -#include -#include // NOLINT // for call_once -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/custom_operator.h" -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/executor_cache.h" -#include "paddle/fluid/framework/executor_gc_helper.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" -#include "paddle/fluid/framework/ir/cost_model.h" -#include "paddle/fluid/framework/ir/generate_pass.h" -#include "paddle/fluid/framework/ir/pass_builder.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/new_executor/executor_statistics.h" -#include "paddle/fluid/framework/new_executor/standalone_executor.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/framework/parallel_executor.h" -#include "paddle/fluid/framework/phi_utils.h" -#include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/framework/scope_pool.h" -#include "paddle/fluid/framework/selected_rows_utils.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/trainer.h" -#include "paddle/fluid/framework/type_defs.h" -#include "paddle/fluid/framework/version.h" -#include "paddle/fluid/imperative/amp_auto_cast.h" -#include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/memory/allocation/allocator_strategy.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" -#endif -#include "paddle/fluid/memory/allocation/mmap_allocator.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/common_infer_shape_functions.h" -#include "paddle/fluid/platform/cpu_helper.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_python.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/fluid/platform/profiler/profiler.h" -#include 
"paddle/fluid/pybind/bind_cost_model.h" -#include "paddle/fluid/pybind/bind_fleet_executor.h" -#include "paddle/fluid/pybind/box_helper_py.h" -#include "paddle/fluid/pybind/communication.h" -#include "paddle/fluid/pybind/compatible.h" -#include "paddle/fluid/pybind/const_value.h" -#include "paddle/fluid/pybind/cuda_streams_py.h" -#include "paddle/fluid/pybind/data_set_py.h" -#include "paddle/fluid/pybind/distributed_py.h" -#include "paddle/fluid/pybind/eager.h" -#include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/pybind/fleet_wrapper_py.h" -#include "paddle/fluid/pybind/generator_py.h" -#include "paddle/fluid/pybind/global_value_getter_setter.h" -#include "paddle/fluid/pybind/gloo_context_py.h" -#include "paddle/fluid/pybind/gloo_wrapper_py.h" -#include "paddle/fluid/pybind/graph.h" -#include "paddle/fluid/pybind/heter_wrapper_py.h" -#include "paddle/fluid/pybind/imperative.h" -#include "paddle/fluid/pybind/inference_api.h" -#include "paddle/fluid/pybind/io.h" -#include "paddle/fluid/pybind/metrics_py.h" -#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" -#include "paddle/fluid/pybind/pybind_variant_caster.h" -#include "paddle/phi/backends/cpu/cpu_info.h" -#include "paddle/phi/backends/device_manager.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/lod_utils.h" -#include "paddle/utils/none.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/pybind/nccl_wrapper_py.h" -#endif -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/pybind/protobuf.h" -#include "paddle/fluid/pybind/pybind.h" // NOLINT -#include "paddle/fluid/pybind/reader_py.h" -#include "paddle/fluid/pybind/tensor_py.h" -#include "paddle/utils/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#endif -#ifndef PADDLE_WITH_HIP -#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#endif - -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/device/xpu/xpu_info.h" -#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" -#endif - -#ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/phi/capi/capi.h" -#endif - -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" - -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" -#include "paddle/fluid/platform/device/ipu/ipu_info.h" -#endif - -#ifdef PADDLE_WITH_CRYPTO -#include "paddle/fluid/pybind/crypto.h" -#endif - -#if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/pybind/fleet_py.h" -#endif - -#ifdef PADDLE_WITH_CINN -#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" -#endif - -#include "paddle/common/flags.h" -#include "paddle/fluid/eager/api/utils/global_utils.h" -#include "paddle/fluid/imperative/layout_autotune.h" -#include "paddle/fluid/pybind/eager_utils.h" -#include "paddle/fluid/pybind/parallel_executor.h" -#include "paddle/phi/api/ext/op_meta_info.h" -#include "paddle/phi/kernels/autotune/cache.h" -#include "paddle/phi/kernels/autotune/switch_autotune.h" -#include "pybind11/stl.h" - -COMMON_DECLARE_bool(use_mkldnn); - -// disable auto conversion to list in Python -PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); -PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); -PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); -PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); - -namespace 
paddle { -namespace pybind { -using namespace paddle::framework; // NOLINT -void BindParallelExecutor(pybind11::module &m) { // NOLINT - // -- python binds for parallel executor. - py::class_ pe(m, "ParallelExecutor"); - py::class_ exec_strategy(pe, "ExecutionStrategy"); - - py::enum_(m, "DeviceType", py::arithmetic()) - .value("CPU", paddle::platform::DeviceType::CPU) - .value("CUDA", paddle::platform::DeviceType::CUDA) - .value("XPU", paddle::platform::DeviceType::XPU); - - exec_strategy.def(py::init()) - .def_property( - "num_threads", - [](const ExecutionStrategy &self) { return self.num_threads_; }, - [](ExecutionStrategy &self, size_t num_threads) { - self.num_threads_ = num_threads; - }) - .def_property( - "_use_device", - [](const ExecutionStrategy &self) { return self.use_device_; }, - [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { - self.use_device_ = use_device; - }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because - // use_device isn‘t exposed to users. - .def_property( - "allow_op_delay", - [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, - [](ExecutionStrategy &self, bool allow_op_delay) { - self.allow_op_delay_ = allow_op_delay; - }) - .def_property( - "num_iteration_per_drop_scope", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_drop_scope_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { - self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }) - .def_property( - "num_iteration_per_run", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_run_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_run) { - self.num_iteration_per_run_ = num_iteration_per_run; - }) - .def_property( - "use_thread_barrier", - [](const ExecutionStrategy &self) { return self.thread_barrier_; }, - [](ExecutionStrategy &self, bool use_thread_barrier) { - self.thread_barrier_ = use_thread_barrier; - }) - .def_property( - "_dry_run", - [](const ExecutionStrategy &self) { return self.dry_run_; }, - [](ExecutionStrategy &self, bool dry_run) { - self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? ExecutionStrategy::kExperimental - : ExecutionStrategy::kDefault; - }); - - py::class_ build_strategy(pe, "BuildStrategy", R"DOC( - BuildStrategy allows the user to more preciously control how to - build the SSA Graph in ParallelExecutor by setting the property. - - Returns: - BuildStrategy: An BuildStrategy object. - - Examples: - .. 
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> data = static.data(name="x", shape=[None, 1], dtype="float32") - >>> hidden = static.nn.fc(data, size=10) - >>> loss = paddle.mean(hidden) - >>> paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.enable_inplace = True - >>> build_strategy.memory_optimize = True - >>> build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - >>> program = static.CompiledProgram(static.default_main_program(), build_strategy=build_strategy) -)DOC"); - - py::enum_(build_strategy, "ReduceStrategy") - .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) - .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) - .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); - py::enum_(build_strategy, - "GradientScaleStrategy") - .value("CoeffNumDevice", - BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) - .value("One", BuildStrategy::GradientScaleStrategy::kOne) - .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); - - build_strategy.def(py::init()) - .def("_clear_finalized", &BuildStrategy::ClearFinalized) - .def_property( - "reduce_strategy", - [](const BuildStrategy &self) { return self.reduce_; }, - [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.reduce_ = strategy; - }, - R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce - strategies in ParallelExecutor, AllReduce and Reduce. If you want - that all the parameters' optimization are done on all devices independently, - you should choose AllReduce; otherwise, if you choose Reduce, all the parameters' - optimization will be evenly distributed to different devices, and then - broadcast the optimized parameter to other devices. - Default is 'AllReduce'. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - )DOC") - .def_property( - "gradient_scale_strategy", - [](const BuildStrategy &self) { return self.gradient_scale_; }, - [](BuildStrategy &self, - BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.gradient_scale_ = strategy; - }, - R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three - ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice, - One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` - according to the number of devices. If you want to customize :math:`loss@grad`, - you can choose Customized. Default is 'CoeffNumDevice'. - - Examples: - .. 
code-block:: python - - >>> import numpy - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> use_cuda = paddle.device.is_compiled_with_cuda - >>> place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - >>> exe = static.Executor(place) - - >>> data = static.data(name='X', shape=[None, 1], dtype='float32') - >>> hidden = static.nn.fc(data, size=10) - >>> loss = paddle.mean(hidden) - >>> paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - >>> exe.run(static.default_startup_program()) - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.gradient_scale_strategy = \ - ... static.BuildStrategy.GradientScaleStrategy.Customized - >>> compiled_prog = static.CompiledProgram( - ... static.default_main_program(), - ... build_strategy=build_strategy, - >>> ) - - >>> x = numpy.random.random(size=(10, 1)).astype('float32') - >>> loss_grad = numpy.ones((1)).astype("float32") * 0.01 - >>> loss_grad_name = loss.name+"@GRAD" - >>> loss_data = exe.run(compiled_prog, - ... feed={"X": x, loss_grad_name : loss_grad}, - ... fetch_list=[loss.name, loss_grad_name]) - )DOC") - .def_property( - "debug_graphviz_path", - [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, - [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.debug_graphviz_path_ = path; - }, - R"DOC((str, optional): debug_graphviz_path indicates the path that - writing the SSA Graph to file in the form of graphviz. - It is useful for debugging. Default is empty string, that is, "" - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.debug_graphviz_path = "./graph" - )DOC") - .def_property( - "enable_sequential_execution", - [](const BuildStrategy &self) { - return self.enable_sequential_execution_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.enable_sequential_execution_ = b; - }, - R"DOC((bool, optional): If set True, the execution order of ops would - be the same as what is in the program. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.enable_sequential_execution = True - )DOC") - .def_property( - "remove_unnecessary_lock", - [](const BuildStrategy &self) { - return self.remove_unnecessary_lock_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.remove_unnecessary_lock_ = b; - }, - R"DOC((bool, optional): If set True, some locks in GPU ops would be - released and ParallelExecutor would run faster. Default is True. - - Examples: - .. 
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.remove_unnecessary_lock = True - )DOC") - .def_property( - "num_trainers", - [](const BuildStrategy &self) { return self.num_trainers_; }, - [](BuildStrategy &self, int num_trainers) { -#ifdef WIN32 - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); -#endif - self.num_trainers_ = num_trainers; - }) - .def_property( - "trainers_endpoints", - [](const BuildStrategy &self) { return self.trainers_endpoints_; }, - [](BuildStrategy &self, - const std::vector &trainers_endpoints) { - self.trainers_endpoints_ = trainers_endpoints; - }) - .def_property( - "trainer_id", - [](const BuildStrategy &self) { return self.trainer_id_; }, - [](BuildStrategy &self, int trainer_id) { - self.trainer_id_ = trainer_id; - }) - .def_property( - "nccl_comm_num", - [](const BuildStrategy &self) { return self.nccl_comm_num_; }, - [](BuildStrategy &self, int nccl_comm_num) { - self.nccl_comm_num_ = nccl_comm_num; - }) - .def_property( - "bkcl_comm_num", - [](const BuildStrategy &self) { return self.bkcl_comm_num_; }, - [](BuildStrategy &self, int bkcl_comm_num) { - self.bkcl_comm_num_ = bkcl_comm_num; - }) - .def_property( - "use_hierarchical_allreduce", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property( - "hierarchical_allreduce_inter_nranks", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) - .def_property( - "build_cinn_pass", - [](const BuildStrategy &self) { return self.build_cinn_pass_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, " - "cannot be configured again.")); - self.build_cinn_pass_ = b; - }, - R"DOC((bool, optional): build_cinn_pass indicates whether - to lowering some operators in graph into cinn ops - to execute, which will speed up the process of execution. - Default False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - >>> paddle.enable_static() - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.build_cinn_pass = True - )DOC") - .def_property( - "fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_elewise_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. 
.def_property( - "build_cinn_pass", - [](const BuildStrategy &self) { return self.build_cinn_pass_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, " - "cannot be configured again.")); - self.build_cinn_pass_ = b; - }, - R"DOC((bool, optional): build_cinn_pass indicates whether - to lower some operators in the graph into CINN ops - for execution, which will speed up execution. - Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - >>> paddle.enable_static() - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.build_cinn_pass = True - )DOC") - .def_property( - "fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_elewise_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_elewise_add_act_ops indicates whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_elewise_add_act_ops = True - )DOC") - .def_property( - "fuse_gemm_epilogue", - [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_gemm_epilogue_ = b; - }, - R"DOC((bool, optional): fuse_gemm_epilogue indicates whether - to fuse matmul_op, elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_gemm_epilogue = True - )DOC") - .def_property( - "fuse_dot_product_attention", - [](const BuildStrategy &self) { - return self.fuse_dot_product_attention_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_dot_product_attention_ = b; - }, - R"DOC((bool, optional): fuse_dot_product_attention indicates whether - to fuse dot product attention, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_dot_product_attention = True - )DOC") - .def_property( - "fuse_adamw", - [](const BuildStrategy &self) { return self.fuse_adamw_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_adamw_ = b; - }, - R"DOC((bool, optional): fuse_adamw indicates whether - to fuse all adamw optimizers with multi_tensor_adam, - it may make the execution faster. Default is False. - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - >>> paddle.enable_static() - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_adamw = True - )DOC") - .def_property( - "fused_attention", - [](const BuildStrategy &self) { return self.fused_attention_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fused_attention_ = b; - }, - R"DOC((bool, optional): fused_attention indicates whether - to fuse the whole multi head attention part with one op, - it may make the execution faster. Default is False. - - Examples: - ..
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fused_attention = True - )DOC") - .def_property( - "fused_feedforward", - [](const BuildStrategy &self) { return self.fused_feedforward_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fused_feedforward_ = b; - }, - R"DOC((bool, optional): fused_feedforward indicates whether - to fuse the whole feed_forward part with one op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fused_feedforward = True - )DOC") - .def_property( - "sequential_run", - [](const BuildStrategy &self) { return self.sequential_run_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.sequential_run_ = b; - }, - R"DOC((bool, optional): sequential_run is used to let the `StandaloneExecutor` run ops in the - order of `ProgramDesc`. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.sequential_run = True - )DOC") - .def_property( - "fuse_resunit", - [](const BuildStrategy &self) { return self.fuse_resunit_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_resunit_ = b; -#ifndef PADDLE_WITH_CUDNN_FRONTEND - if (self.fuse_resunit_) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle is not built with CUDNN Frontend support.")); - } -#endif - }, - R"DOC((bool, optional): fuse_resunit indicates whether - to fuse ResUnit patterns; this requires Paddle to be built - with CUDNN Frontend support. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_resunit = True - )DOC") - .def_property( - "fuse_bn_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_bn_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_act_ops indicates whether - to fuse batch_norm and activation_op, - it may make the execution faster. Default is False. - - Examples: - ..
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_bn_act_ops = True - )DOC") - .def_property( - "fuse_bn_add_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_bn_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_add_act_ops indicates whether - to fuse batch_norm, elementwise_add and activation_op, - it may make the execution faster. Default is True. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_bn_add_act_ops = True - )DOC") - .def_property( - "enable_auto_fusion", - [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.enable_auto_fusion_ = b; - }, - R"DOC((bool, optional): Whether to enable fusing a subgraph into a - fusion_group. Currently, only subgraphs composed of elementwise-like - operators, such as elementwise_add/mul without broadcast, and - activations are supported. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.enable_auto_fusion = True - )DOC") - .def_property( - "fuse_relu_depthwise_conv", - [](const BuildStrategy &self) { - return self.fuse_relu_depthwise_conv_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.fuse_relu_depthwise_conv_ = b; - }, - R"DOC((bool, optional): fuse_relu_depthwise_conv indicates whether - to fuse relu and depthwise_conv2d, - it will save GPU memory and may make the execution faster. - This option is only available on GPU devices. - Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_relu_depthwise_conv = True - )DOC") - .def_property( - "fuse_broadcast_ops", - [](const BuildStrategy &self) { - return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, " - "cannot be configured again.")); - self.fuse_broadcast_ops_ = b; - }, - R"DOC((bool, optional): fuse_broadcast_ops indicates whether - to fuse the broadcast ops. Note that, in Reduce mode, - fusing broadcast ops may make the program faster, because - fusing broadcast ops is equivalent to delaying their execution, - so that for a period of time all NCCL streams are used only - for NCCLReduce operations. Default is False. - - Examples: - ..
code-block:: python - - >>> import paddle - >>> import paddle.static as static - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.fuse_broadcast_ops = True - )DOC") - .def_property( - "fuse_all_optimizer_ops", - [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, " - "cannot be configured again.")); - self.fuse_all_optimizer_ops_ = b; - }) - .def_property( - "sync_batch_norm", - [](const BuildStrategy &self) { return self.sync_batch_norm_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finalized, cannot be " - "configured again.")); - self.sync_batch_norm_ = b; - }, - R"DOC((bool, optional): sync_batch_norm indicates whether to use - synchronous batch normalization, which synchronizes the mean - and variance across multiple devices during training. - The current implementation does not support FP16 training or CPU, - and synchronization happens only within a single machine, - not across machines. Default is False. - - Examples: - .. code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.sync_batch_norm = True - )DOC") - .def_property( - "memory_optimize", - [](const BuildStrategy &self) -> py::object { - if (self.memory_optimize_) { // NOLINT - return py::cast(self.memory_optimize_.get()); - } else { - return py::cast(nullptr); - } - }, - [](BuildStrategy &self, const py::handle &value) { - auto *py_obj = value.ptr(); - if (py_obj == nullptr || py_obj == Py_None) { - self.memory_optimize_ = paddle::none; - } else if (PyBool_Check(py_obj)) { - self.memory_optimize_ = (py_obj == Py_True); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "BuildStrategy.memory_optimize must be set to None, False " - "or True")); - } - }, - R"DOC((bool, optional): memory_optimize aims to reduce total memory - consumption, set to True to enable it. - - Default is None, which means the framework chooses automatically: - currently, it is enabled when GC is disabled, and disabled when GC - is enabled. True means enabling and False means disabling. - - Examples: - ..
code-block:: python - - >>> import paddle - >>> import paddle.static as static - - >>> paddle.enable_static() - - >>> build_strategy = static.BuildStrategy() - >>> build_strategy.memory_optimize = True - - )DOC") - .def_property( - "is_distribution", - [](const BuildStrategy &self) { return self.is_distribution_; }, - [](BuildStrategy &self, bool b) { -#ifdef WIN32 - if (b) { - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); - } -#else - self.is_distribution_ = b; -#endif - }) - .def_property( - "async_mode", - [](const BuildStrategy &self) { return self.async_mode_; }, - [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) - .def_property( - "enable_inplace", - [](const BuildStrategy &self) { return self.enable_inplace_; }, - [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) - .def_property( - "enable_addto", - [](const BuildStrategy &self) { return self.enable_addto_; }, - [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) - .def_property( - "fuse_all_reduce_ops", - [](const BuildStrategy &self) { - return self.fuse_all_reduce_ops_ == true || - self.fuse_all_reduce_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property( - "enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) - .def_property( - "cache_runtime_context", - [](const BuildStrategy &self) { return self.cache_runtime_context_; }, - [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) - .def_property( - "mkldnn_enabled_op_types", - [](const BuildStrategy &self) { - return self.mkldnn_enabled_op_types_; - }, - [](BuildStrategy &self, - const std::unordered_set &mkldnn_enabled_op_types) { - self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; - }) - .def_property( - "fix_op_run_order", - [](const BuildStrategy &self) { return self.fix_op_run_order_; }, - [](BuildStrategy &self, bool fix_op_run_order) { - self.fix_op_run_order_ = fix_op_run_order; - }) - .def_property( - "allow_cuda_graph_capture", - [](const BuildStrategy &self) { - return self.allow_cuda_graph_capture_; - }, - [](BuildStrategy &self, bool allow_cuda_graph_capture) { - self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; - }) - .def("_copy", - [](const BuildStrategy &self) { - auto new_bs = self; - new_bs.ClearFinalized(); - return new_bs; - }) - .def("__str__", - [](const BuildStrategy &self) { - std::stringstream ss; - ss << self; - return ss.str(); - }) - .def( - "_finalize_strategy_and_create_passes", - [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(true); - }, - R"DOC(Allow users to customize passes. Normally, model-specific - optimization passes should be defined in this way. BuildStrategy - cannot be updated after being finalized.)DOC"); - - m.def("_set_cached_executor_build_strategy", - [](int64_t program_id, const BuildStrategy &build_strategy) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - cached_exe_info.SetBuildStrategy(program_id, build_strategy); - }); - - pe.def(py::init &, - const std::vector &, - const std::string &, - Scope *, - std::vector &, - const ExecutionStrategy &, - const BuildStrategy &, - ir::Graph *>()) - // NOTE: even we return a vec* to Python use reference policy.
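Taken together, the BuildStrategy bindings deleted above are typically driven from Python as in the sketch below. The particular flag choices are illustrative only, drawn from the docstrings in this file:

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    build_strategy = static.BuildStrategy()
    # Each option mirrors a def_property binding documented above.
    build_strategy.fuse_elewise_add_act_ops = True
    build_strategy.fuse_bn_act_ops = True
    build_strategy.memory_optimize = True

    compiled_prog = static.CompiledProgram(
        static.default_main_program(), build_strategy=build_strategy
    )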
- // We still cannot get local_scope from this vector, since the element - // of vec will be freed by Python GC. We can only return Scope* - // one by one and mark them as reference. - .def( - "local_scopes", - [](ParallelExecutor &self) -> std::vector * { - return &self.GetLocalScopes(); - }, - py::return_value_policy::reference) - .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) - .def("_need_create_local_exe_scopes", - &ParallelExecutor::NeedCreateLocalExeScope) - .def("feed_tensors_into_local_scopes", - &ParallelExecutor::FeedTensorsIntoLocalScopes) - .def("feed_and_split_tensor_into_local_scopes", - &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", - [](ParallelExecutor &self, - const std::vector &fetch_tensors, - bool return_merged) -> py::object { - if (return_merged) { - paddle::framework::FetchList ret; - /*gil_scoped_release*/ { - pybind11::gil_scoped_release release; - ret = self.RunAndMerge(fetch_tensors); - } - return py::cast(std::move(ret)); - } else { - paddle::framework::FetchUnmergedList ret; - /*gil_scoped_release*/ { - pybind11::gil_scoped_release release; - ret = self.Run(fetch_tensors); - } - return py::cast(std::move(ret)); - } - }) - .def("device_count", &ParallelExecutor::DeviceCount); - using VarQuantScale = - std::unordered_map>; - py::class_> pass(m, "Pass"); - pass.def(py::init()) - .def("has", &ir::Pass::Has) - .def("set_not_owned", - [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { - self.SetNotOwned(attr_name, &attr); - }) - .def( - "set", - [](ir::Pass &self, const std::string &name, const std::string &attr) { - self.Set(name, new std::string(attr)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, bool val) { - self.Set(name, new bool(val)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::vector set) { - self.Set(name, new std::vector(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, VarQuantScale scales) { - self.Set(name, new VarQuantScale(scales)); - }) - .def("type", &ir::Pass::Type) - .def("apply", [](ir::Pass &self, std::shared_ptr graph) { - self.Apply(graph.get()); - }); - - py::class_> pb( - m, "PassBuilder"); - pb.def(py::init()) - .def("append_pass", - [](ir::PassBuilder &self, - const std::string &pass_type) -> std::shared_ptr { - return self.AppendPass(pass_type); - }) - .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) - .def("insert_pass", - [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { - return self.InsertPass(idx, pass_type); - }) - .def("remove_pass", - [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); -} - -} // namespace pybind -} // namespace paddle diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 999475d5944d5..fcd8c12579847 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -116,6 +116,7 @@ using pir::Program; using pir::StrAttribute; using pir::Type; using pir::Value; +using pir::VectorType; using pybind11::return_value_policy; COMMON_DECLARE_bool(print_ir); @@ -411,6 +412,12 @@ void BindProgram(py::module *m) { 
[](Program &self, IrMapping &ir_mapper) { return Clone(self, &ir_mapper); }) + .def( + "copy_to_block", + [](std::shared_ptr self, + pir::IrMapping &mapper, + Block *block) { return self->CopyToBlock(mapper, block); }, + return_value_policy::reference) .def( "list_vars", [](std::shared_ptr self) { @@ -653,9 +660,12 @@ void BindIrMapping(py::module *m) { ir_mapping.def(py::init<>()) .def("look_up", [](IrMapping &self, Value from) { return self.Lookup(from); }) - .def("add", [](IrMapping &self, Value from, Value to) { - self.Add(from, to); - }); + .def("add", + [](IrMapping &self, Value from, Value to) { + self.Add(from, to); + }) + .def("size", + [](IrMapping &self) { return self.GetMutableMap().size(); }); } void BindCloneOptions(py::module *m) { @@ -1320,6 +1330,13 @@ void BindType(py::module *m) { PADDLE_THROW(phi::errors::InvalidArgument( "can't set _local_shape when building static graph")); }) + .def("as_vec_type", + [](Type self) -> py::object { + if (auto vec_type = self.dyn_cast()) { + return py::cast(vec_type); + } + return py::cast(Py_None); + }) .def("__str__", [](Type &self) { std::ostringstream print_stream; print_stream << self; @@ -1354,7 +1371,13 @@ void BindType(py::module *m) { } }); } - +void BindVectorType(py::module *m) { + py::class_ vec_type(*m, "VectorType"); + vec_type.def("as_list", &VectorType::data); + m->def("create_vec_type", [](std::vector &types) { + return VectorType::get(pir::IrContext::Instance(), types); + }); +} void BindAttribute(py::module *m) { py::class_ ir_attr(*m, "Attribute", py::module_local()); ir_attr.def("__eq__", &Attribute::operator==) @@ -2486,6 +2509,7 @@ void BindPir(pybind11::module *module) { BindOperation(&ir_module); BindOpOperand(&ir_module); BindType(&ir_module); + BindVectorType(&ir_module); BindAttribute(&ir_module); BindInsertionPoint(&ir_module); BindUtils(&ir_module); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b1163adc932fc..ae49f2594ce0a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -62,7 +62,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/raw_tensor.h" @@ -146,7 +145,6 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/pybind/compiled_program.h" -#include "paddle/fluid/pybind/parallel_executor.h" #include "paddle/fluid/pybind/place.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT diff --git a/paddle/phi/api/generator/dist_api_gen.py b/paddle/phi/api/generator/dist_api_gen.py index 54605d19b256d..a8a2b23d3d026 100644 --- a/paddle/phi/api/generator/dist_api_gen.py +++ b/paddle/phi/api/generator/dist_api_gen.py @@ -295,7 +295,7 @@ }} std::vector {name}_meta_ptr_vec({name}.size()); for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{ - {name}_meta_ptr_vec[i] = &{name}_meta_vec[i]; + {name}_meta_ptr_vec[i] = {name}[i] ? 
&{name}_meta_vec[i] : nullptr; }} """ INFER_GLOBAL_SHAPE_TEMPLATE = """ @@ -400,7 +400,7 @@ std::vector {name}_meta_vec = MakeMetaTensor({name}); std::vector {name}_meta_ptr_vec({name}_meta_vec.size()); for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{ - {name}_meta_ptr_vec[i] = &{name}_meta_vec[i]; + {name}_meta_ptr_vec[i] = {name}[i] ? &{name}_meta_vec[i] : nullptr; }} """ INFER_META_TEMPLATE = """ diff --git a/paddle/phi/api/generator/dist_bw_api_gen.py b/paddle/phi/api/generator/dist_bw_api_gen.py index 1d57d552d7767..34d495d9d0536 100644 --- a/paddle/phi/api/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/generator/dist_bw_api_gen.py @@ -53,33 +53,41 @@ std::shared_ptr shared_dist_out = CreateKernelDistOutput({}, !rank_is_in_current_mesh, spmd_info.second[0]); phi::distributed::DistTensor* dist_out = shared_dist_out.get(); - phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value(); - if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{ - *dense_out = phi::DenseTensor( + phi::DenseTensor* dense_out = nullptr; + if (dist_out) {{ + dense_out = dist_out->unsafe_mutable_value(); + if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{ + *dense_out = phi::DenseTensor( std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), phi::DenseTensorMeta()); }} + }} """ SINGLE_OUT_CREATION_TEMPLATE = """ std::shared_ptr shared_dist_out = CreateKernelDistOutput({}, !rank_is_in_current_mesh); phi::distributed::DistTensor* dist_out = shared_dist_out.get(); - phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value(); - if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{ + phi::DenseTensor* dense_out = nullptr; + if (dist_out) {{ + dense_out = dist_out->unsafe_mutable_value(); + if (dense_out && !rank_is_in_current_mesh && !dist_out->defined()) {{ *dense_out = phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} """ VECTOR_OUT_CREATION_TEMPLATE_WITH_NO_SPMD = """ auto dist_out = SetKernelDistOutput({name}); - std::vector dense_out(dist_out.size()); + std::vector dense_out(dist_out.size(), nullptr); for (size_t i=0; iunsafe_mutable_value(); - if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ - *dense_out[i] = phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + if (dist_out[i]) {{ + dense_out[i] = dist_out[i]->unsafe_mutable_value(); + if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ + *dense_out[i] = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} }} """ @@ -90,13 +98,15 @@ for(auto& e: shared_dist_out){{ dist_out.push_back(e.get()); }} - std::vector dense_out(dist_out.size()); + std::vector dense_out(dist_out.size(), nullptr); for (size_t i=0; iunsafe_mutable_value(); - if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ - *dense_out[i] = phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + if (dist_out[i]) {{ + dense_out[i] = dist_out[i]->unsafe_mutable_value(); + if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ + *dense_out[i] = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} }} """ @@ -108,13 
+118,15 @@ for(auto& e: shared_dist_out){{ dist_out.push_back(e.get()); }} - std::vector dense_out(dist_out.size()); + std::vector dense_out(dist_out.size(), nullptr); for (size_t i=0; iunsafe_mutable_value(); - if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ - *dense_out[i] = phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + if (dist_out[i]) {{ + dense_out[i] = dist_out[i]->unsafe_mutable_value(); + if (dense_out[i] && !rank_is_in_current_mesh && !dist_out[i]->defined()) {{ + *dense_out[i] = phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} }} """ @@ -156,13 +168,15 @@ """ MULTI_VECTOR_OUT_CREATION_TEMPLATE = """ auto dist_out_{i} = SetKernelDistOutput({name}); - std::vector dense_out_{i}(dist_out_{i}.size()); + std::vector dense_out_{i}(dist_out_{i}.size(), nullptr); for (size_t i = 0; i < dist_out_{i}.size(); i++) {{ - dense_out_{i}[i] = const_cast(&dist_out_{i}[i]->value()); - if (dense_out_{i}[i] && !rank_is_in_current_mesh && !dist_out_{i}[i]->defined()) {{ - *dense_out_{i}[i]= phi::DenseTensor( - std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), - phi::DenseTensorMeta()); + if (dist_out_{i}[i]) {{ + dense_out_{i}[i] = const_cast(&dist_out_{i}[i]->value()); + if (dense_out_{i}[i] && !rank_is_in_current_mesh && !dist_out_{i}[i]->defined()) {{ + *dense_out_{i}[i]= phi::DenseTensor( + std::make_shared(nullptr, 0, phi::distributed::GetDefaultPlace()), + phi::DenseTensorMeta()); + }} }} }} """ diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index ef5cfc90727ff..c6426898371d2 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -736,6 +736,7 @@ std::shared_ptr CreateKernelDistOutput( } return dist_output; } + VLOG(4) << "CreateKernelDistOutput with NULL out"; return nullptr; } diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index ee1e21a58e2f1..e2eb1af09d8a5 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -23,8 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/cuda_stream.h" #endif -namespace paddle { -namespace experimental { +namespace paddle::experimental { void DeviceContextPool::SyncDeviceContext(const Place& place) { if (!phi::DeviceContextPool::IsInitialized()) { @@ -64,8 +63,7 @@ phi::DeviceContext* DeviceContextPool::GetMutable(const Place& place) { return const_cast(Get(place)); // NOLINT } -} // namespace experimental -} // namespace paddle +} // namespace paddle::experimental namespace paddle { diff --git a/paddle/phi/backends/dynload/cublas.cc b/paddle/phi/backends/dynload/cublas.cc index 2fe9ae774bf7a..b870a90cb091c 100644 --- a/paddle/phi/backends/dynload/cublas.cc +++ b/paddle/phi/backends/dynload/cublas.cc @@ -14,8 +14,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/cublas.h" -namespace phi { -namespace dynload { +namespace phi::dynload { std::once_flag cublas_dso_flag; void *cublas_dso_handle = nullptr; @@ -34,5 +33,4 @@ CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); #ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); #endif -} // namespace dynload -} // namespace phi +} // namespace phi::dynload diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h index 8053bbb6bd2ce..6da85283d6e71 100644 --- a/paddle/phi/backends/dynload/cublas.h +++ b/paddle/phi/backends/dynload/cublas.h @@ -94,8 +94,14 @@ extern void *cublas_dso_handle; __macro(cublasSgetriBatched); \ __macro(cublasDgetrfBatched); \ __macro(cublasDgetriBatched); \ + __macro(cublasCgetrfBatched); \ + __macro(cublasCgetriBatched); \ + __macro(cublasZgetrfBatched); \ + __macro(cublasZgetriBatched); \ __macro(cublasSmatinvBatched); \ __macro(cublasDmatinvBatched); \ + __macro(cublasCmatinvBatched); \ + __macro(cublasZmatinvBatched); \ __macro(cublasSgetrsBatched); \ __macro(cublasDgetrsBatched); diff --git a/paddle/phi/backends/dynload/cusparse.cc b/paddle/phi/backends/dynload/cusparse.cc index ce8f87dc3cdfa..9d89b746df5b7 100644 --- a/paddle/phi/backends/dynload/cusparse.cc +++ b/paddle/phi/backends/dynload/cusparse.cc @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cusparse.h" -namespace phi { -namespace dynload { +namespace phi::dynload { std::once_flag cusparse_dso_flag; void *cusparse_dso_handle; @@ -34,5 +33,4 @@ CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP); #endif -} // namespace dynload -} // namespace phi +} // namespace phi::dynload diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 612a959fc307b..5d8e26732196d 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -351,14 +351,14 @@ void* GetCublasDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll"); #else return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll"); #else return GetDsoHandleFromSearchPath( @@ -372,13 +372,13 @@ void* GetCublasDsoHandle() { } #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.11"); #else return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12"); #else return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); @@ -400,13 +400,13 @@ void* GetCublasLtDsoHandle() { // APIs available after CUDA 10.1 #if defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { 
-#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.11"); #else return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12"); #else return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); @@ -448,7 +448,7 @@ void* GetCUDNNDsoHandle() { "You should do this according to your CUDA installation directory and " "CUDNN version."); if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg); #else @@ -456,7 +456,7 @@ void* GetCUDNNDsoHandle() { FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); #endif } else if (CUDA_VERSION >= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg); #else @@ -467,7 +467,7 @@ void* GetCUDNNDsoHandle() { #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); #else -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES if (CUDA_VERSION >= 12030) { return GetDsoHandleFromSearchPath( FLAGS_cudnn_dir, "libcudnn.so.9", false, {cuda_lib_path}); @@ -488,7 +488,7 @@ void* GetCUPTIDsoHandle() { FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path}); #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cupti_dir, "libcupti.so.11.8", false, {cupti_lib_path}); #else @@ -497,7 +497,7 @@ void* GetCUPTIDsoHandle() { #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path}); #else @@ -520,7 +520,7 @@ void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path}); #else @@ -530,7 +530,7 @@ void* GetCurandDsoHandle() { #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); #else -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so.10"); #else return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so"); @@ -564,7 +564,7 @@ void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path}); #else @@ -572,7 +572,7 @@ void* GetCusolverDsoHandle() { FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); #endif #else -#ifdef 
WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so"); @@ -585,14 +585,14 @@ void* GetCusparseDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll"); #else return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll"); #else return GetDsoHandleFromSearchPath( @@ -606,13 +606,13 @@ void* GetCusparseDsoHandle() { } #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.11"); #else return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so"); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12"); #else return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so"); @@ -716,7 +716,7 @@ void* GetNCCLDsoHandle() { return GetDsoHandleFromSearchPath( FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); #else -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so;libnccl.so.2", true, {}, warning_msg); #else @@ -782,7 +782,7 @@ void* GetCUFFTDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib"); #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.10"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); @@ -797,14 +797,14 @@ void* GetCUFFTDsoHandle() { } #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll"); #else return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { -#ifdef WITH_PIP_CUDA_LIBRARIES +#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll"); #else return GetDsoHandleFromSearchPath( diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index fc0f8ee1e35e1..d3a569b34c5ac 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -1055,7 +1055,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT64, phi::DataType::BOOL, phi::DataType::FLOAT64, - phi::DataType::FLOAT32})}, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, {"tile_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"transpose2_grad", XPUKernelSet({phi::DataType::FLOAT32, 
@@ -1248,6 +1249,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, {"sequence_unpad_xpu", XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"block_multihead_attention_xpu", XPUKernelSet({phi::DataType::FLOAT16})}, }; return s_xpu2_kernels; diff --git a/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto b/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto index 70c9e72aa5fe7..71c18ac426019 100644 --- a/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto +++ b/paddle/phi/core/distributed/auto_parallel/auto_parallel.proto @@ -25,7 +25,7 @@ message ProcessMeshProto { // There are no duplicate process ids within one process mesh. repeated int64 process_ids = 2; - // The name of each dimension. + // The name of each dimension. repeated string dim_names = 3; } @@ -37,17 +37,17 @@ message TensorDistAttrProto { optional ProcessMeshProto process_mesh = 1; // The length of dims_mapping is same as the length of the tensor shape. - // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension + // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension // of the above process mesh. If dims_mapping[i] is -1, the i-th dimension of the tensor // will not be sharded. For example, given a tensor shape [2, 6, 12], a process mesh // shape [2, 3] and a dims_mapping [-1, 1, 0], each sharded tensor will have a shape [2, 2, 6]. repeated int64 dims_mapping = 2; - // The batch dimension of the corresponding tensor. + // The batch dimension of the corresponding tensor. optional int64 batch_dim = 3; - // If the dynamic_dims[i] is True, the i-th dimension of the corresponding tensor - // is dynamic changed. Otherwise, the i-th dimension of the tensor is static determined. + // If the dynamic_dims[i] is True, the i-th dimension of the corresponding tensor + // is dynamic changed. Otherwise, the i-th dimension of the tensor is static determined. repeated bool dynamic_dims = 4; // This field is used to distinguish vars which are in same process_mesh and in different vpp chunk @@ -60,16 +60,16 @@ message OperatorDistAttrProto { message TensorDistAttrMappingEntryProto { optional string name = 1; optional TensorDistAttrProto tensor_dist_attr = 2; - } + } // The key of this map is the input tensor name and the value is the distributed attribute - // of the input tensor required by this corresponding operator. - // The distributed attribute of the actual tensor may be not the same as that within + // of the input tensor required by this corresponding operator. + // The distributed attribute of the actual tensor may be not the same as that within // the distributed attribute of the operator. repeated TensorDistAttrMappingEntryProto input_dist_attrs = 1; // The key of this map is the output tensor name and the value is the distributed attribute - // of the output tensor required by this corresponding operator. - // The distributed attribute of the actual tensor may be not the same as that within + // of the output tensor required by this corresponding operator. + // The distributed attribute of the actual tensor may be not the same as that within // the distributed attribute of the operator. repeated TensorDistAttrMappingEntryProto output_dist_attrs = 2; @@ -81,7 +81,7 @@ message OperatorDistAttrProto { // may shared the same distributed operator, the field is use for this scenario. 
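The dims_mapping comment above already contains a worked example; the shard-shape arithmetic it describes can be sketched as follows (illustrative helper, not part of this patch):

.. code-block:: python

    def local_shape(tensor_shape, mesh_shape, dims_mapping):
        # dims_mapping[i] == -1 means dimension i is replicated;
        # otherwise dimension i is split over mesh axis dims_mapping[i].
        shape = list(tensor_shape)
        for i, axis in enumerate(dims_mapping):
            if axis != -1:
                shape[i] //= mesh_shape[axis]
        return shape

    # Matches the example in the comment: shape [2, 6, 12] on a [2, 3] mesh.
    assert local_shape([2, 6, 12], [2, 3], [-1, 1, 0]) == [2, 2, 6]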
optional string impl_type = 4; - // This field tells which distributed implementations of this corresponding operator + // This field tells which distributed implementations of this corresponding operator // will be selected for the actual computation. optional int64 impl_idx = 5; @@ -115,13 +115,13 @@ message DeviceProto { optional string type = 4; // The capability of this device. - optional DeviceCapabilityProto capability = 5; + optional DeviceCapabilityProto capability = 5; } -// This proto describes the capability of the link between two devices. -message LinkCapabilityProto { - optional int64 bandwidth = 1; // Bytes/s - optional int64 latency = 2; +// This proto describes the capability of the link between two devices. +message LinkCapabilityProto { + optional int64 bandwidth = 1; // Bytes/s + optional int64 latency = 2; } message LinkProto { @@ -133,14 +133,14 @@ message LinkProto { // Represent the link type. optional string type = 3; - + // The capability of this link. - optional LinkCapabilityProto capability = 4; + optional LinkCapabilityProto capability = 4; } // DeviceMesh is used to organize devices and like n-dimension array. message DeviceMeshProto { - // The global id of this mesh. + // The global id of this mesh. optional string name = 1; // The size of each dimension. @@ -150,13 +150,13 @@ message DeviceMeshProto { // There are no duplicate device ids within one device mesh. repeated int64 device_ids = 3; - // The name of each dimension. + // The name of each dimension. repeated string dim_names = 4; // The devices of this mesh. repeated DeviceProto devices = 5; - // The links are between devices. + // The links are between devices. repeated LinkProto links = 6; } diff --git a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc index 62fbd97c46ab2..98dfa339589a5 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc @@ -21,8 +21,7 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/phi/core/distributed/auto_parallel/proto_helper.h" -namespace phi { -namespace distributed { +namespace phi::distributed { using phi::distributed::auto_parallel::str_join; using phi::distributed::auto_parallel::TensorDistAttrProto; @@ -450,5 +449,4 @@ bool TensorDistAttr::is_partial(int64_t mesh_axis) const { void TensorDistAttr::set_skip_check_mesh(bool skip) { skip_check_mesh_ = skip; } -} // namespace distributed -} // namespace phi +} // namespace phi::distributed diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc index bd415480d64e9..947a4b77f6961 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/x_to_r_reshard_function.cc @@ -25,8 +25,7 @@ #include "paddle/phi/kernels/p_recv_kernel.h" #include "paddle/phi/kernels/p_send_kernel.h" -namespace phi { -namespace distributed { +namespace phi::distributed { bool XToRShrinkReshardFunction::IsSuitable( const DistTensor& in, const TensorDistAttr& out_dist_attr) { @@ -130,5 +129,4 @@ void XToRShrinkReshardFunction::Eval(phi::DeviceContext* dev_ctx, } } -} // namespace distributed -} // namespace phi +} // namespace phi::distributed diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 494fe160696ff..e63ced99ec539 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -49,6 +49,8 @@ class InferMetaContext { void EmplaceBackOutputs( paddle::small_vector outputs); + void UpdataInput(size_t idx, MetaTensor input) { inputs_[idx] = input; } + TEST_API virtual const MetaTensor& InputAt(size_t idx) const; TEST_API virtual std::vector InputsBetween( @@ -68,6 +70,10 @@ class InferMetaContext { const std::pair& InputRangeAt(size_t idx) const; TEST_API const std::pair& OutputRangeAt(size_t idx) const; + size_t InputsSize() const { return inputs_.size(); } + size_t OutputsSize() const { return outputs_.size(); } + size_t AttrsSize() const { return attrs_.size(); } + virtual ~InferMetaContext() = default; protected: diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 947af3af1d089..5fa75214fcfb5 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -75,6 +75,10 @@ class KernelContext { void AssignOutputRange(std::pair&& range, size_t idx); + void UpdataInput(size_t idx, const TensorBase* input) { + inputs_[idx] = input; + } + template const TensorType& InputAt(size_t idx) const { return static_cast(*(inputs_.at(idx))); diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index e16ec77a3b0e1..23259d40093af 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -445,6 +445,34 @@ void CudnnLSTMGradInferMeta( } } +void LSTMGradInferMeta(const MetaTensor& input, + const MetaTensor& h0, + const MetaTensor& c0, + const MetaTensor& weight, + const MetaTensor& bias, + MetaTensor* input_grad, + MetaTensor* h0_grad, + MetaTensor* c0_grad, + MetaTensor* weight_grad, + MetaTensor* bias_grad, + MetaConfig config) { + if (input_grad) { + input_grad->share_meta(input); + } + if (h0_grad) { + h0_grad->share_meta(h0); + } + if (c0_grad) { + c0_grad->share_meta(c0); + } + if (weight_grad) { + weight_grad->share_meta(weight); + } + if (bias_grad) { + bias_grad->share_meta(bias); + } +} + void DeformableConvGradInferMeta(const 
MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index e9971b5042ac0..89795c008d34d 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -167,6 +167,18 @@ void CudnnLSTMGradInferMeta( MetaTensor* init_c_grad, std::vector weight_list_grad); +void LSTMGradInferMeta(const MetaTensor& input, + const MetaTensor& h0, + const MetaTensor& c0, + const MetaTensor& weight, + const MetaTensor& bias, + MetaTensor* input_grad, + MetaTensor* h0_grad, + MetaTensor* c0_grad, + MetaTensor* weight_grad, + MetaTensor* bias_grad, + MetaConfig config = MetaConfig()); + void DeformableConvGradInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 3c3ef874854ab..aa4028efa1a6e 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -3573,6 +3573,45 @@ void TakeAlongAxisInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void TdmChildInferMeta(const MetaTensor& x, + const MetaTensor& tree_info, + int child_nums, + DataType dtype, + MetaTensor* child, + MetaTensor* leaf_mask) { + PADDLE_ENFORCE_GT( + child_nums, + 0, + phi::errors::InvalidArgument( + "ValueError: The value of the 'child_nums' must be greater than 0. " + "But received child_nums value = %d, ", + child_nums)); + + const auto& info_dims = tree_info.dims(); + const auto& input_dims = x.dims(); + + PADDLE_ENFORCE_EQ( + info_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: The dimensions of the 'tree info' must be 2. " + "But received tree info's dimensions = %d, " + "tree info's shape = [%s].", + info_dims.size(), + info_dims)); + + auto output_dims = common::vectorize(input_dims); + output_dims.push_back(child_nums); + if (child != nullptr) { + child->set_dims(common::make_ddim(output_dims)); + leaf_mask->set_dims(common::make_ddim(output_dims)); + child->share_lod(x); + leaf_mask->share_lod(x); + child->set_dtype(x.dtype()); + leaf_mask->set_dtype(x.dtype()); + } +} + void TriangularSolveInferMeta(const MetaTensor& x, const MetaTensor& y, bool upper, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index e166746e3a646..391d01debd7a3 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -635,6 +635,13 @@ void TakeAlongAxisInferMeta(const MetaTensor& x, int axis, MetaTensor* out); +void TdmChildInferMeta(const MetaTensor& x, + const MetaTensor& tree_info, + int child_nums, + DataType dtype, + MetaTensor* child, + MetaTensor* leaf_mask); + void TriangularSolveInferMeta(const MetaTensor& x, const MetaTensor& y, bool upper, diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 9987524d4997d..5cba3aa1c1a29 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -377,6 +377,89 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv, } } +void BlockMultiheadAttentionInferXPUMeta( + const MetaTensor& qkv, + const MetaTensor& key_cache, + const MetaTensor& value_cache, + const MetaTensor& seq_lens_encoder, + const MetaTensor& seq_lens_decoder, + const MetaTensor& seq_lens_this_time, + const MetaTensor& padding_offsets, + const MetaTensor& cum_offsets, + const MetaTensor& cu_seqlens_q, + const MetaTensor& cu_seqlens_k, + const MetaTensor& cache_k_per_batch_maxs, + const MetaTensor& cache_v_per_batch_maxs, + const MetaTensor& block_tables, + const MetaTensor&
pre_key_cache, + const MetaTensor& pre_value_cache, + const MetaTensor& rope_emb, + const MetaTensor& mask, + const MetaTensor& tgt_mask, + const MetaTensor& cache_k_quant_scales, + const MetaTensor& cache_v_quant_scales, + const MetaTensor& cache_k_dequant_scales, + const MetaTensor& cache_v_dequant_scales, + const MetaTensor& qkv_out_scale, + const MetaTensor& qkv_bias, + const MetaTensor& out_shift, + const MetaTensor& out_smooth, + const MetaTensor& max_enc_len_this_time, + const MetaTensor& max_dec_len_this_time, + int max_seq_len, + int block_size, + bool use_neox_style, + bool dynamic_cachekv_quant, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + const float out_scale, + const std::string& compute_dtype, + MetaTensor* fmha_out, + MetaTensor* qkv_out, + MetaTensor* key_cache_out, + MetaTensor* value_cache_out) { + BlockMultiheadAttentionInferMeta(qkv, + key_cache, + value_cache, + seq_lens_encoder, + seq_lens_decoder, + seq_lens_this_time, + padding_offsets, + cum_offsets, + cu_seqlens_q, + cu_seqlens_k, + block_tables, + pre_key_cache, + pre_value_cache, + rope_emb, + mask, + tgt_mask, + cache_k_quant_scales, + cache_v_quant_scales, + cache_k_dequant_scales, + cache_v_dequant_scales, + qkv_out_scale, + qkv_bias, + out_shift, + out_smooth, + max_enc_len_this_time, + max_dec_len_this_time, + max_seq_len, + block_size, + use_neox_style, + dynamic_cachekv_quant, + quant_round_type, + quant_max_bound, + quant_min_bound, + out_scale, + compute_dtype, + fmha_out, + qkv_out, + key_cache_out, + value_cache_out); +} + void Conv1dXPUInferMeta(const MetaTensor& x, const MetaTensor& x_max, const MetaTensor& filter, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index aa48f64434ee3..989c0dd28a1b4 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -128,6 +128,49 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv, MetaTensor* key_cache_out, MetaTensor* value_cache_out); +void BlockMultiheadAttentionInferXPUMeta( + const MetaTensor& qkv, + const MetaTensor& key_cache, + const MetaTensor& value_cache, + const MetaTensor& seq_lens_encoder, + const MetaTensor& seq_lens_decoder, + const MetaTensor& seq_lens_this_time, + const MetaTensor& padding_offsets, + const MetaTensor& cum_offsets, + const MetaTensor& cu_seqlens_q, + const MetaTensor& cu_seqlens_k, + const MetaTensor& cache_k_per_batch_maxs, + const MetaTensor& cache_v_per_batch_maxs, + const MetaTensor& block_tables, + const MetaTensor& pre_key_cache, + const MetaTensor& pre_value_cache, + const MetaTensor& rope_emb, + const MetaTensor& mask, + const MetaTensor& tgt_mask, + const MetaTensor& cache_k_quant_scales, + const MetaTensor& cache_v_quant_scales, + const MetaTensor& cache_k_dequant_scales, + const MetaTensor& cache_v_dequant_scales, + const MetaTensor& qkv_out_scale, + const MetaTensor& qkv_bias, + const MetaTensor& out_shift, + const MetaTensor& out_smooth, + const MetaTensor& max_enc_len_this_time, + const MetaTensor& max_dec_len_this_time, + int max_seq_len, + int block_size, + bool use_neox_style, + bool dynamic_cachekv_quant, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + const float out_scale, + const std::string& compute_dtype, + MetaTensor* fmha_out, + MetaTensor* qkv_out, + MetaTensor* key_cache_out, + MetaTensor* value_cache_out); + void Conv1dXPUInferMeta(const MetaTensor& x, const MetaTensor& x_max, const MetaTensor& filter, diff --git 
a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index a80997970f8fb..84d0e7ffaf469 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1349,6 +1349,115 @@ void CudnnLSTMInferMeta( state_out->set_dtype(phi::DataType::UINT8); } +void LSTMInferMeta(const MetaTensor& input, + const MetaTensor& h0, + const MetaTensor& c0, + const MetaTensor& weight, + const MetaTensor& bias, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + MetaTensor* hidden, + MetaTensor* cell, + MetaTensor* batch_gate, + MetaTensor* batch_cell_pre_act, + MetaConfig config) { + const auto& in_dims = input.dims(); + PADDLE_ENFORCE_EQ( + in_dims.size(), + 2, + phi::errors::InvalidArgument( + "Input(X)'s rank must be 2, but received %d.", in_dims.size())); + + if (h0) { + PADDLE_ENFORCE_EQ( + c0.initialized(), + true, + phi::errors::NotFound("Input(Cell) and Input(Hidden) of LSTM " + "should not be null at the same time.")); + const auto& h_dims = h0.dims(); + const auto& c_dims = c0.dims(); + PADDLE_ENFORCE_EQ(h_dims, + c_dims, + phi::errors::InvalidArgument( + "The dimension of Input(H0) and Input(C0) should " + "be the same, but received [%s] (H0) vs [%s] (C0).", + h_dims, + c_dims)); + } + + int frame_size = static_cast(in_dims[1] / 4); + const auto& w_dims = weight.dims(); + PADDLE_ENFORCE_EQ( + w_dims.size(), + 2, + phi::errors::InvalidArgument( + "The rank of Input(Weight) should be 2, but received %d.", + w_dims.size())); + PADDLE_ENFORCE_EQ(w_dims[0], + frame_size, + phi::errors::InvalidArgument( + "The first dimension of Input(Weight) should be %d, " + "but received %d.", + frame_size, + w_dims[0])); + PADDLE_ENFORCE_EQ(w_dims[1], + 4 * frame_size, + phi::errors::InvalidArgument( + "The second dimension of Input(Weight) should be 4 * " + "%d, but received %d.", + frame_size, + w_dims[1])); + + const auto& b_dims = bias.dims(); + PADDLE_ENFORCE_EQ(b_dims.size(), + 2, + phi::errors::InvalidArgument( + "The rank of Input(Bias) should be 2, but received %d.", + b_dims.size())); + PADDLE_ENFORCE_EQ( + b_dims[0], + 1, + phi::errors::InvalidArgument( + "The first dimension of Input(Bias) should be 1, but received %d.", + b_dims[0])); + + if (use_peepholes) { + PADDLE_ENFORCE_EQ( + b_dims[1], + 7 * frame_size, + phi::errors::InvalidArgument( + "The second dimension of Input(Bias) should be 7 * %d if enable " + "peepholes connection, but received %d.", + frame_size, + b_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + b_dims[1], + 4 * frame_size, + phi::errors::InvalidArgument( + "The second dimension of Input(Bias) should be 4 * %d if disable " + "peepholes connection, but received %d.", + frame_size, + b_dims[1])); + } + + phi::DDim out_dims({in_dims[0], frame_size}); + hidden->set_dims(out_dims); + cell->set_dims(out_dims); + if (!is_test) { + batch_gate->set_dims(in_dims); + batch_cell_pre_act->set_dims(out_dims); + } + hidden->share_lod(input); + cell->share_lod(input); + hidden->set_dtype(input.dtype()); + cell->set_dtype(input.dtype()); +} + void DecayedAdagradInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 56dff7422b2cc..a73212505f669 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -292,6 +292,23 @@ void CudnnLSTMInferMeta( MetaTensor* reserve, MetaTensor* state_out); +void 
LSTMInferMeta(const MetaTensor& input, + const MetaTensor& h0, + const MetaTensor& c0, + const MetaTensor& weight, + const MetaTensor& bias, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + MetaTensor* hidden, + MetaTensor* cell, + MetaTensor* batch_gate, + MetaTensor* batch_cell_pre_act, + MetaConfig config = MetaConfig()); + void DecayedAdagradInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, diff --git a/paddle/phi/infermeta/spmd_rules/flatten.cc b/paddle/phi/infermeta/spmd_rules/flatten.cc index a0f084b491771..b33411e4b2518 100644 --- a/paddle/phi/infermeta/spmd_rules/flatten.cc +++ b/paddle/phi/infermeta/spmd_rules/flatten.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" #include "paddle/phi/core/distributed/auto_parallel/utils.h" #include "paddle/phi/infermeta/spmd_rules/dim_trans.h" +#include "paddle/phi/infermeta/spmd_rules/reshape.h" #include "paddle/phi/infermeta/spmd_rules/utils.h" namespace phi { @@ -105,41 +106,31 @@ SpmdInfo FlattenInferSpmd(const DistMetaTensor& x, x_ndim, x_dims_mapping.size())); - // Step1: Build the transformation from - // the original shape to the target shape - + // obtain target shape and use ReshapeInferSpmdDynamic to infer start_axis = PreprocessAxis(start_axis, x_ndim); stop_axis = PreprocessAxis(stop_axis, x_ndim); - std::vector> trans = - MakeFlattenDimTrans(src_shape, start_axis, stop_axis); - - // Step2: Infer the dims mapping of input (if reshard is - // needed) and output from the dimension transformation. - std::vector> dims_mapping_vec = - InferFromDimTrans(x, trans); - - // Step3: Update the dist attributes of input - // and output with the inferred dims mapping. 
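// The replacement below computes the flattened target shape directly and
// delegates SPMD inference to ReshapeInferSpmdDynamic. A standalone sketch of
// that shape computation (FlattenTargetShape is a hypothetical helper written
// for illustration, not part of this patch; requires <cstdint> and <vector>):
std::vector<int64_t> FlattenTargetShape(const std::vector<int64_t>& src,
                                        int64_t start_axis,
                                        int64_t stop_axis) {
  std::vector<int64_t> dst;
  int64_t flatten_size = 1;
  for (int64_t i = 0; i < static_cast<int64_t>(src.size()); ++i) {
    if (i < start_axis || i > stop_axis) {
      dst.push_back(src[i]);  // axes outside [start_axis, stop_axis] are kept
    } else {
      flatten_size *= src[i];  // axes inside the range are merged
      if (i == stop_axis) dst.push_back(flatten_size);
    }
  }
  return dst;  // e.g. src = {2, 3, 4, 5}, axes [1, 2] -> {2, 12, 5}
}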
- TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src); - x_dist_attr_dst.set_dims_mapping(dims_mapping_vec[0]); - TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src); - out_dist_attr.set_dims_mapping(dims_mapping_vec[1]); + std::vector dst_shape; + int64_t flatten_size = 1; + for (int64_t i = 0; i < x_ndim; i++) { + if (i < start_axis || i > stop_axis) { + dst_shape.emplace_back(src_shape[i]); + } else { + flatten_size *= src_shape[i]; + if (i == stop_axis) { + dst_shape.emplace_back(flatten_size); + } + } + } VLOG(4) << "FlattenInferSpmd: X shape: [" << str_join(src_shape) << "]"; VLOG(4) << "Start_axis: " << start_axis; - VLOG(4) << "Stop_axis: " << start_axis; - VLOG(4) << "Transformation from input to output:"; - for (int64_t i = 0, n = static_cast(trans.size()); i < n; i++) { - std::shared_ptr t = trans[i]; - VLOG(4) << "\tOut axis[" << i << "]: " << t->to_string(); - } - VLOG(4) << "X dims_mapping_src: [" << str_join(x_dims_mapping) - << "] dims_mapping_dst: [" << str_join(dims_mapping_vec[0]) << "]"; - VLOG(4) << "Out dims_mapping: [" << str_join(dims_mapping_vec[1]) << "]\n\n"; - - return {{x_dist_attr_dst}, {out_dist_attr}}; + VLOG(4) << "Stop_axis: " << stop_axis; + VLOG(4) << "FlattenInferSpmd: output shape: [" << str_join(dst_shape) << "]"; + VLOG(4) << "use ReshapeInferSpmdDynamic to infer distributed attribute"; + return ReshapeInferSpmdDynamic(x, dst_shape); } +// TODO(jeff41404): consider xshape and use ReshapeInferSpmdReverse in future SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, int start_axis, @@ -198,5 +189,10 @@ SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, return {{x_dist_attr}, {out_dist_attr_dst}}; } +SpmdInfo FlattenGradInferSpmd(const DistMetaTensor& xshape, + const DistMetaTensor& out_grad) { + return ReshapeGradInferSpmd(xshape, out_grad); +} + } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/flatten.h b/paddle/phi/infermeta/spmd_rules/flatten.h index bb62d8c0d7b0a..28bf5e56d5256 100644 --- a/paddle/phi/infermeta/spmd_rules/flatten.h +++ b/paddle/phi/infermeta/spmd_rules/flatten.h @@ -30,5 +30,8 @@ SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, int start_axis, int stop_axis); + +SpmdInfo FlattenGradInferSpmd(const DistMetaTensor& xshape, + const DistMetaTensor& out_grad); } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 6c278867d9ac3..d4731ce7afd3c 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -645,6 +645,141 @@ void GlobalScatterInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void AddGroupNormSiluInferMeta(const MetaTensor& x, + const MetaTensor& residual, + const MetaTensor& scale, + const MetaTensor& bias, + float epsilon, + int groups, + const std::string& data_layout_str, + const std::string& activation, + MetaTensor* y, + MetaTensor* residual_out, + MetaTensor* mean, + MetaTensor* variance) { + PADDLE_ENFORCE_NE(y, + nullptr, + phi::errors::InvalidArgument( + "The y in GroupNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE(mean, + nullptr, + phi::errors::InvalidArgument( + "The mean in GroupNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + variance, + nullptr, + phi::errors::InvalidArgument( + "The variance in GroupNormInferMeta can't be nullptr.")); + + auto x_dim = x.dims(); + PADDLE_ENFORCE_GE( + x_dim.size(), + 2, + 
phi::errors::InvalidArgument( + "The Input(X)'s dimension of Op(group_norm) must be " + "greater than 1. But received: %u-D Tensor, which shape is [%s].", + x_dim.size(), + x_dim)); + + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); + const int64_t channel_num = + (data_layout == DataLayout::kNCHW ? x_dim[1] : x_dim[x_dim.size() - 1]); + auto batch_size = x_dim[0]; + PADDLE_ENFORCE_LE( + groups, + channel_num, + phi::errors::InvalidArgument( + "The Attr(groups) of Op(group_norm) must be less than or " + "equal to the number of channels. But received: groups " + "is [%s], channels is [%s], the Attr(data_layout) " + "is [%s]. The error may come from wrong data_layout setting.", + groups, + channel_num, + data_layout_str)); + PADDLE_ENFORCE_GE( + groups, + 1, + phi::errors::InvalidArgument( + "The Attr(groups) of Op(group_norm) must be " + "greater than or equal to 1. But received: groups is [%s].", + groups)); + PADDLE_ENFORCE_EQ( + channel_num % groups, + 0, + phi::errors::InvalidArgument( + "Expected number of channels in input to be divisible by " + "num_groups, but got input channel is %d and num_groups is %d", + channel_num, + groups)); + + if (scale) { + PADDLE_ENFORCE_EQ( + scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The Input(Scale) of Op(group_norm) should be 1-D Tensor. " + "But received: %u-D Tensor, the shape of Input(Scale) is [%s].", + scale.dims().size(), + scale.dims())); + PADDLE_ENFORCE_EQ( + scale.dims()[0], + channel_num, + phi::errors::InvalidArgument( + "The Input(Scale)'s first dimension size of Op(group_norm) must " + "be equal to the number of channels. But received: the " + "Input(Scale)'s first dimension size is [%s], the channels is " + "[%s], the Attr(data_layout) is [%s]. The error may come " + "from wrong data_layout setting.", + scale.dims()[0], + channel_num, + data_layout_str)); + } + if (bias) { + PADDLE_ENFORCE_EQ( + bias.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The Input(Bias) of Op(group_norm) should be 1-D Tensor. " + "But received: %u-D Tensor, the shape of Input(Bias) is [%s].", + bias.dims().size(), + bias.dims())); + PADDLE_ENFORCE_EQ( + bias.dims()[0], + channel_num, + phi::errors::InvalidArgument( + "The Input(Bias)'s first dimension size of " + "Op(group_norm) must be equal to the number of channels. " + "But received: the Input(Bias)'s first dimension size is [%s], " + "the channels is [%s], the Attr(data_layout) is [%s]. The " + "error may come from wrong data_layout setting.", + bias.dims()[0], + channel_num, + data_layout_str)); + } + y->set_dims(x_dim); + y->set_dtype(x.dtype()); + y->share_lod(x); + + phi::DataType x_dtype = x.dtype(); + phi::DataType param_type = + (x_dtype == phi::DataType::BFLOAT16 || x_dtype == phi::DataType::FLOAT16) + ? 
phi::DataType::FLOAT32 + : x_dtype; + if (mean) { + mean->set_dims({batch_size, groups}); + mean->set_dtype(param_type); + } + if (variance) { + variance->set_dims({batch_size, groups}); + variance->set_dtype(param_type); + } + if (residual_out) { + residual_out->set_dims(x_dim); + residual_out->set_dtype(x.dtype()); + residual_out->share_lod(x); + } +} + void GroupNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 8732a87c55cd6..1b276846619e6 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -144,6 +144,19 @@ void GlobalScatterInferMeta(const MetaTensor& x, bool use_calc_stream, MetaTensor* out); +void AddGroupNormSiluInferMeta(const MetaTensor& x, + const MetaTensor& residual, + const MetaTensor& scale, + const MetaTensor& bias, + float epsilon, + int groups, + const std::string& data_layout, + const std::string& activation, + MetaTensor* y, + MetaTensor* residual_out, + MetaTensor* mean, + MetaTensor* variance); + void GroupNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 0aca647dd6a49..96d34a0157e8d 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -55,6 +55,21 @@ if(DEFINED REDUCE_INFERENCE_LIB_SIZE) endif() if(WITH_CUTLASS) + add_custom_target( + gemm_epilogue_compile_script ALL + COMMAND bash compile.sh "${PYTHON_EXECUTABLE}" "${CUDA_TOOLKIT_ROOT_DIR}" + \"${NVCC_ARCH_BIN}\" "${CMAKE_COMMAND}" + WORKING_DIRECTORY + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue + COMMENT "GemmEpilogue compile script") + add_custom_target( + fused_conv2d_add_act_compile_script ALL + COMMAND bash compile.sh "${PYTHON_EXECUTABLE}" "${CUDA_TOOLKIT_ROOT_DIR}" + \"${NVCC_ARCH_BIN}\" "${CMAKE_COMMAND}" + WORKING_DIRECTORY + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/conv2d + COMMENT "FusedConv2dAddAct compile script") + execute_process( COMMAND ${PYTHON_EXECUTABLE} diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc index f39bddbb443ba..422f566c6612e 100644 --- a/paddle/phi/kernels/cpu/cumprod_kernel.cc +++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc @@ -32,8 +32,16 @@ void CumprodKernel(const Context& dev_ctx, DenseTensor* out) { const DenseTensor* x = &input; auto* x_data = x->data(); - auto* out_data = dev_ctx.template Alloc(out); + auto* out_ptr = dev_ctx.template Alloc(out); DDim shape = x->dims(); + DenseTensor out_tmp; + T* out_data = nullptr; + if (x_data == out_ptr) { + out_tmp.Resize(shape); + out_data = dev_ctx.template Alloc(&out_tmp); + } else { + out_data = out_ptr; + } size_t outer_dim = 1; size_t mid_dim = 1; @@ -88,6 +96,9 @@ void CumprodKernel(const Context& dev_ctx, } } } + if (x_data == out_ptr) { + memcpy(out_ptr, out_data, out->numel() * sizeof(T)); + } } } // namespace phi diff --git a/paddle/phi/kernels/cpu/inverse_grad_kernel.cc b/paddle/phi/kernels/cpu/inverse_grad_kernel.cc index 97c10e69c8eab..5014cfd0f95c7 100644 --- a/paddle/phi/kernels/cpu/inverse_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/inverse_grad_kernel.cc @@ -16,5 +16,11 @@ #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - inverse_grad, CPU, ALL_LAYOUT, phi::InverseGradKernel, float, double) {} +PD_REGISTER_KERNEL(inverse_grad, + CPU, + ALL_LAYOUT, + phi::InverseGradKernel, + float, + double, + phi::dtype::complex, 
+ phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/inverse_kernel.cc b/paddle/phi/kernels/cpu/inverse_kernel.cc index 4b21718eca3f2..6fecef6f888dc 100644 --- a/paddle/phi/kernels/cpu/inverse_kernel.cc +++ b/paddle/phi/kernels/cpu/inverse_kernel.cc @@ -16,5 +16,11 @@ #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - inverse, CPU, ALL_LAYOUT, phi::InverseKernel, float, double) {} +PD_REGISTER_KERNEL(inverse, + CPU, + ALL_LAYOUT, + phi::InverseKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/fluid/operators/ops_signature/number_count_sig.cc b/paddle/phi/kernels/cpu/lstm_grad_kernel.cc similarity index 58% rename from paddle/fluid/operators/ops_signature/number_count_sig.cc rename to paddle/phi/kernels/cpu/lstm_grad_kernel.cc index 48e0b4fce9ac1..ddaa85c8bdce1 100644 --- a/paddle/fluid/operators/ops_signature/number_count_sig.cc +++ b/paddle/phi/kernels/cpu/lstm_grad_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,15 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/compat/op_utils.h" +#include +#include +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" -namespace phi { - -KernelSignature NumberCountOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("number_count", {"numbers"}, {"upper_range"}, {"Out"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(number_count, phi::NumberCountOpArgumentMapping); +PD_REGISTER_KERNEL( + lstm_grad, CPU, ALL_LAYOUT, phi::LSTMGradKernel, float, double) {} diff --git a/paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc b/paddle/phi/kernels/cpu/lstm_kernel.cc similarity index 50% rename from paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc rename to paddle/phi/kernels/cpu/lstm_kernel.cc index d3bf58bdec3c8..848ba68bb3b76 100644 --- a/paddle/fluid/operators/ops_signature/channel_shuffle_sig.cc +++ b/paddle/phi/kernels/cpu/lstm_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,19 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
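// Shape relations enforced by the new LSTMInferMeta for the lstm kernels
// registered here, writing D for frame_size (= in_dims[1] / 4):
//   Input(X):      [T, 4*D]  (concatenated gate pre-activations)
//   Input(Weight): [D, 4*D]
//   Input(Bias):   [1, 4*D], or [1, 7*D] when use_peepholes is true
//   Output(Hidden), Output(Cell): [T, D]
// Worked example: D = 128 gives X [T, 512], Weight [128, 512],
// Bias [1, 512] (or [1, 896] with peepholes), Hidden and Cell [T, 128].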
-#include "paddle/phi/core/compat/op_utils.h" +#include +#include +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" -namespace phi { - -KernelSignature ChannelShuffleGradOpArgumentMapping( - const ArgumentMappingContext& ctx UNUSED) { - return KernelSignature("channel_shuffle_grad", - {"Out@GRAD"}, - {"groups", "data_format"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(channel_shuffle_grad, - phi::ChannelShuffleGradOpArgumentMapping); +PD_REGISTER_KERNEL(lstm, CPU, ALL_LAYOUT, phi::LSTMKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc index 5b43fb02b5117..9d1319e0b5e4a 100644 --- a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc @@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(meshgrid_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/meshgrid_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_kernel.cc index 35e43f7bbc85e..a0239da6bb128 100644 --- a/paddle/phi/kernels/cpu/meshgrid_kernel.cc +++ b/paddle/phi/kernels/cpu/meshgrid_kernel.cc @@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(meshgrid, float, double, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/tdm_child_kernel.cc b/paddle/phi/kernels/cpu/tdm_child_kernel.cc index 246f2113d65e8..3fabbba572f7e 100644 --- a/paddle/phi/kernels/cpu/tdm_child_kernel.cc +++ b/paddle/phi/kernels/cpu/tdm_child_kernel.cc @@ -104,7 +104,7 @@ void TDMChildKernel(const Context &dev_ctx, const phi::DenseTensor &x, const phi::DenseTensor &tree_info, int child_nums, - int dtype, + phi::DataType dtype, phi::DenseTensor *child, phi::DenseTensor *leaf_mask) { const auto &input_type = x.dtype(); @@ -132,7 +132,7 @@ void TDMChildKernel(const Context &dev_ctx, DataTypeToString(DataType::INT32), DataTypeToString(DataType::INT64))); - auto output_type = phi::TransToPhiDataType(dtype); + auto output_type = dtype; bool out_type_match = output_type == DataType::INT32 || output_type == DataType::INT64; PADDLE_ENFORCE_EQ(out_type_match, diff --git a/paddle/phi/kernels/cpu/tile_kernel.cc b/paddle/phi/kernels/cpu/tile_kernel.cc index 2320c30310a64..30eb1d5cd6c47 100644 --- a/paddle/phi/kernels/cpu/tile_kernel.cc +++ b/paddle/phi/kernels/cpu/tile_kernel.cc @@ -27,5 +27,6 @@ PD_REGISTER_KERNEL(tile, double, int, int64_t, + phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 96b2128eee16c..a58b5998a6703 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -685,6 +685,63 @@ struct CUBlas> { ldb, batch_size)); } + + static void GETRF_BATCH(cublasHandle_t handle, + int n, + phi::dtype::complex **A, + int lda, + int *ipiv, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetrfBatched( + handle, + n, + reinterpret_cast(A), + lda, + ipiv, + info, + batch_size)); + } + + static void GETRI_BATCH(cublasHandle_t handle, + int n, + const phi::dtype::complex **A, + int lda, + const int *ipiv, + phi::dtype::complex **Ainv, + int ldc, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetriBatched( + handle, + n, + reinterpret_cast(A), + lda, + ipiv, + reinterpret_cast(Ainv), + ldc, + info, + batch_size)); + } + 
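// Sketch of the call order these two wrappers serve in batched inversion
// (illustrative only; d_A and d_Ainv are assumed device-resident arrays of
// matrix pointers, d_ipiv holds n * batch pivots and d_info batch status
// ints, as the cuBLAS batched APIs require):
//   CUBlas<phi::dtype::complex<float>>::GETRF_BATCH(
//       handle, n, d_A, /*lda=*/n, d_ipiv, d_info, batch);
//   CUBlas<phi::dtype::complex<float>>::GETRI_BATCH(
//       handle, n, d_A_const, /*lda=*/n, d_ipiv, d_Ainv, /*ldc=*/n,
//       d_info, batch);
// GETRF overwrites d_A with the LU factors in place; GETRI then writes the
// inverses out of place into d_Ainv. The MATINV_BATCH wrapper added next is
// the single-call alternative, but cuBLAS documents matinvBatched as limited
// to small matrices (n <= 32), so the GETRF/GETRI pair is the general path.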
+ static void MATINV_BATCH(cublasHandle_t handle, + int n, + const phi::dtype::complex **A, + int lda, + phi::dtype::complex **Ainv, + int lda_inv, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCmatinvBatched( + handle, + n, + reinterpret_cast(A), + lda, + reinterpret_cast(Ainv), + lda_inv, + info, + batch_size)); + } }; template <> @@ -923,6 +980,63 @@ struct CUBlas> { "cublasGemmEx is not supported on cuda <= 7.5")); #endif } + + static void GETRF_BATCH(cublasHandle_t handle, + int n, + phi::dtype::complex **A, + int lda, + int *ipiv, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetrfBatched( + handle, + n, + reinterpret_cast(A), + lda, + ipiv, + info, + batch_size)); + } + + static void GETRI_BATCH(cublasHandle_t handle, + int n, + const phi::dtype::complex **A, + int lda, + const int *ipiv, + phi::dtype::complex **Ainv, + int ldc, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetriBatched( + handle, + n, + reinterpret_cast(A), + lda, + ipiv, + reinterpret_cast(Ainv), + ldc, + info, + batch_size)); + } + + static void MATINV_BATCH(cublasHandle_t handle, + int n, + const phi::dtype::complex **A, + int lda, + phi::dtype::complex **Ainv, + int lda_inv, + int *info, + int batch_size) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZmatinvBatched( + handle, + n, + reinterpret_cast(A), + lda, + reinterpret_cast(Ainv), + lda_inv, + info, + batch_size)); + } }; template <> diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cc b/paddle/phi/kernels/funcs/concat_and_split_functor.cc index fd49748666a6e..c42bbbd3a5318 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cc +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cc @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" -namespace phi { -namespace funcs { +namespace phi::funcs { /* * All tensors' dimension should be the same and the values of @@ -132,5 +131,4 @@ struct SplitFunctor { FOR_ALL_TYPES(DEFINE_FUNCTOR); -} // namespace funcs -} // namespace phi +} // namespace phi::funcs diff --git a/paddle/phi/kernels/funcs/lstm_utils.h b/paddle/phi/kernels/funcs/lstm_utils.h new file mode 100644 index 0000000000000..4a02b097fd340 --- /dev/null +++ b/paddle/phi/kernels/funcs/lstm_utils.h @@ -0,0 +1,36 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
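// The new lstm_utils.h beginning here hosts ReorderInitState, shared by the
// lstm CPU and GPU kernels. It follows the CopyMatrixRowsFunctor convention:
// with indexed_src == true it gathers, dst[i] = src[index_lod[i]]; with
// indexed_src == false it scatters, dst[index_lod[i]] = src[i]. A hedged
// usage sketch, with `order` standing in for the batch order produced by the
// sequence2batch functors:
//   phi::DenseTensor ordered_h0;
//   ReorderInitState<Context, T>(dev_ctx, h0, order, &ordered_h0, true);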
+ +#pragma once +#include + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/mixed_vector.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" + +namespace phi { + +template +inline void ReorderInitState(const Context& dev_ctx, + const phi::DenseTensor& src, + phi::Vector index_lod, + phi::DenseTensor* dst, + bool indexed_src) { + phi::funcs::CopyMatrixRowsFunctor row_shuffle; + dst->Resize(src.dims()); + dev_ctx.template Alloc(dst); + row_shuffle(dev_ctx, src, index_lod, dst, indexed_src); +} +} // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc index c316970e6a560..2a3749ef36b81 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" -namespace phi { -namespace funcs { +namespace phi::funcs { template void MatrixInverseFunctor::operator()(const Context& dev_ctx, @@ -28,6 +27,7 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, template class MatrixInverseFunctor; template class MatrixInverseFunctor; +template class MatrixInverseFunctor>; +template class MatrixInverseFunctor>; -} // namespace funcs -} // namespace phi +} // namespace phi::funcs diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index c0ea7ad84c41b..f46dd714c9f55 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -131,6 +131,8 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, template class MatrixInverseFunctor; template class MatrixInverseFunctor; +template class MatrixInverseFunctor>; +template class MatrixInverseFunctor>; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h index f0cd265a54648..d45f7d8863a63 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.h +++ b/paddle/phi/kernels/funcs/matrix_inverse.h @@ -25,14 +25,69 @@ limitations under the License. 
*/ namespace phi { namespace funcs { +template +struct MapMatrixInverseFunctor { + void operator()( + const Context& dev_ctx, const T* a_ptr, T* a_inv_ptr, int offset, int n) { + using Matrix = + Eigen::Matrix; + using EigenMatrixMap = Eigen::Map; + using ConstEigenMatrixMap = Eigen::Map; + + ConstEigenMatrixMap mat(a_ptr + offset, n, n); + EigenMatrixMap mat_inv(a_inv_ptr + offset, n, n); + Eigen::PartialPivLU lu; + lu.compute(mat); + + const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_GT(min_abs_pivot, + static_cast(0), + errors::InvalidArgument("Input is not invertible.")); + mat_inv.noalias() = lu.inverse(); + } +}; + +template +struct MapMatrixInverseFunctor> { + void operator()(const Context& dev_ctx, + const phi::dtype::complex* a_ptr, + phi::dtype::complex* a_inv_ptr, + int offset, + int n) { + using Matrix = Eigen::Matrix, + Eigen::Dynamic, + Eigen::Dynamic, + Eigen::RowMajor>; + using EigenMatrixMap = Eigen::Map; + using ConstEigenMatrixMap = Eigen::Map; + std::complex* std_ptr = new std::complex[n * n]; + std::complex* std_inv_ptr = new std::complex[n * n]; + for (int i = 0; i < n * n; i++) { + *(std_ptr + i) = static_cast>(*(a_ptr + offset + i)); + } + ConstEigenMatrixMap mat(std_ptr, n, n); + EigenMatrixMap mat_inv(std_inv_ptr, n, n); + Eigen::PartialPivLU lu; + lu.compute(mat); + + const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_NE(min_abs_pivot, + static_cast>(0), + errors::InvalidArgument("Input is not invertible.")); + mat_inv.noalias() = lu.inverse(); + for (int i = 0; i < n * n; i++) { + *(a_inv_ptr + offset + i) = + static_cast>(*(std_inv_ptr + i)); + } + delete[] std_ptr; + delete[] std_inv_ptr; + } +}; + template void ComputeInverseEigen(const Context& dev_ctx, const DenseTensor& a, DenseTensor* a_inv) { - using Matrix = - Eigen::Matrix; - using EigenMatrixMap = Eigen::Map; - using ConstEigenMatrixMap = Eigen::Map; const auto& mat_dims = a.dims(); const int rank = mat_dims.size(); int n = mat_dims[rank - 1]; @@ -41,17 +96,13 @@ void ComputeInverseEigen(const Context& dev_ctx, const T* a_ptr = a.data(); T* a_inv_ptr = dev_ctx.template Alloc(a_inv); + // Putting phi::dtype::complex into eigen::matrix has a problem, + // it's not going to get the right result, + // so we're going to convert it to std::complex and + // then we're going to put it into eigen::matrix. for (int i = 0; i < batch_size; ++i) { - ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); - EigenMatrixMap mat_inv(a_inv_ptr + i * n * n, n, n); - Eigen::PartialPivLU lu; - lu.compute(mat); - - const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); - PADDLE_ENFORCE_GT(min_abs_pivot, - static_cast(0), - errors::InvalidArgument("Input is not invertible.")); - mat_inv.noalias() = lu.inverse(); + MapMatrixInverseFunctor functor; + functor(dev_ctx, a_ptr, a_inv_ptr, i * n * n, n); } } diff --git a/paddle/phi/kernels/funcs/sequence_pooling.cc b/paddle/phi/kernels/funcs/sequence_pooling.cc index f4ee9c323366e..1fdaadfea01a1 100644 --- a/paddle/phi/kernels/funcs/sequence_pooling.cc +++ b/paddle/phi/kernels/funcs/sequence_pooling.cc @@ -21,8 +21,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/jit/kernels.h" #include "paddle/phi/kernels/funcs/math_function.h" -namespace phi { -namespace funcs { +namespace phi::funcs { template ; template class SequencePoolGradFunctor; template class SequencePoolGradFunctor; -} // namespace funcs -} // namespace phi +} // namespace phi::funcs diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc index 4ff18849316d8..456d3370990cb 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc @@ -23,8 +23,7 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" -namespace phi { -namespace fusion { +namespace phi::fusion { template void FusionSeqConvEltAddReluKernel(const Context& dev_ctx, @@ -148,8 +147,7 @@ void FusionSeqConvEltAddReluKernel(const Context& dev_ctx, true); } -} // namespace fusion -} // namespace phi +} // namespace phi::fusion PD_REGISTER_KERNEL(fusion_seqconv_eltadd_relu, CPU, diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt index d760ce773c135..abcf220aa5c54 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.23) +cmake_minimum_required(VERSION 3.18) if(NOT DEFINED PYTHON_EXECUTABLE) message( diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh index eb13c7dd6723d..8ac34b55144df 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh @@ -13,21 +13,38 @@ # limitations under the License. set -e -cutlass_repo_directory="cutlass" -if [ ! -d "$cutlass_repo_directory" ]; then - git clone --branch v3.0.0 https://github.com/NVIDIA/cutlass -fi - build_directory="build" if [ ! -d "$build_directory" ]; then mkdir $build_directory fi -python_exe_path="python" -cuda_root_path="/usr/local/cuda" -gpu_cc="80" +libname="$build_directory/libCutlassConv2d.so" +if [ -e "$libname" ]; then + exit 0 +fi + +default_python_exe_path="/usr/bin/python" +default_cuda_root_path="/usr/local/cuda" +default_gpu_cc="80" +default_cmake_command="cmake" + +python_exe_path="${1:-$default_python_exe_path}" +cuda_root_path="${2:-$default_cuda_root_path}" +gpu_cc="${3:-$default_gpu_cc}" +cmake_command="${4:-$default_cmake_command}" + +case "$gpu_cc" in + 75|80|86|89) ;; + *) exit 0 ;; +esac + +cutlass_repo_directory="cutlass" +if [ ! -d "$cutlass_repo_directory" ]; then + git clone --branch v3.0.0 https://github.com/NVIDIA/cutlass +fi + cd $build_directory -cmake .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc -make -j +$cmake_command .. 
-DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc +make -j8 cd - diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt index 6ad5035e9dcd6..fc9cfa1cfd919 100644 --- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt +++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.23) +cmake_minimum_required(VERSION 3.18) if(NOT DEFINED PYTHON_EXECUTABLE) message( diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh index f8a5463239a95..4352cb6381354 100644 --- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh +++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/compile.sh @@ -13,21 +13,38 @@ # limitations under the License. set -e -cutlass_repo_directory="cutlass" -if [ ! -d "$cutlass_repo_directory" ]; then - git clone --branch v2.11.0 https://github.com/NVIDIA/cutlass -fi - build_directory="build" if [ ! -d "$build_directory" ]; then mkdir $build_directory fi -python_exe_path="/usr/bin/python" -cuda_root_path="/usr/local/cuda" -gpu_cc="80" +libname="$build_directory/libCutlassGemmEpilogue.so" +if [ -e "$libname" ]; then + exit 0 +fi + +default_python_exe_path="/usr/bin/python" +default_cuda_root_path="/usr/local/cuda" +default_gpu_cc="80" +default_cmake_command="cmake" + +python_exe_path="${1:-$default_python_exe_path}" +cuda_root_path="${2:-$default_cuda_root_path}" +gpu_cc="${3:-$default_gpu_cc}" +cmake_command="${4:-$default_cmake_command}" + +case "$gpu_cc" in + 80|86|89) ;; + *) exit 0 ;; +esac + +cutlass_repo_directory="cutlass" +if [ ! -d "$cutlass_repo_directory" ]; then + git clone --branch v2.11.0 https://github.com/NVIDIA/cutlass +fi + cd $build_directory -cmake .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc -make -j +$cmake_command .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc +make -j8 cd - diff --git a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h index 8f1be5983f646..8b36a43fdf843 100644 --- a/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h +++ b/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_util.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#include +#include #include #include "paddle/phi/kernels/fusion/cutlass/gemm_epilogue/gemm_epilogue_decl.h" diff --git a/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc new file mode 100644 index 0000000000000..b38a0b1c00dc2 --- /dev/null +++ b/paddle/phi/kernels/fusion/xpu/block_multi_head_attention_kernel.cc @@ -0,0 +1,606 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "glog/logging.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/flash_attn_kernel.h" +#include "xpu/xdnn.h" + +namespace phi { +namespace fusion { + +template +int GetMaxLen(const Context& dev_ctx, + const phi::DenseTensor& seq_lens_tensor, + phi::DenseTensor* max_len_tensor, + const int batch_size) { + int max_len_cpu = 0; + int r = baidu::xpu::api::reduce_max(dev_ctx.x_context(), + seq_lens_tensor.data(), + max_len_tensor->data(), + {batch_size}, + {0}); + PD_CHECK(r == 0, "baidu::xpu::api::reduce_max failed."); + xpu_wait(dev_ctx.x_context()->xpu_stream); + r = xpu_memcpy(&max_len_cpu, + max_len_tensor->data(), + sizeof(int), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + PD_CHECK(r == 0, "xpu_memcpy failed."); + return max_len_cpu; +} + +template +void qkv_split_rope_kernel(const Context& xpu_ctx, + const DenseTensor& qkv_input, + const DenseTensor& rotary_emb, + const DenseTensor& seq_lens, + const baidu::xpu::api::VectorParam& lods, + int bsz, + int max_seq_len, + int token_num, + int num_head, + int dim_head, + DenseTensor* q_out, + DenseTensor* k_out, + DenseTensor* v_out) { + xpu::ctx_guard RAII_GUARD(xpu_ctx.x_context()); + using XPUType = typename XPUTypeTrait::Type; + auto q_data = reinterpret_cast(q_out->data()); + auto k_data = reinterpret_cast(k_out->data()); + auto v_data = reinterpret_cast(v_out->data()); + int r = baidu::xpu::api::split( + xpu_ctx.x_context(), + reinterpret_cast(qkv_input.data()), + {q_data, k_data, v_data}, + {token_num, 3, num_head * dim_head}, + {1, 1, 1}, + 1); + const_cast(&qkv_input)->clear(); + PD_CHECK(r == 0, "baidu::xpu::api::split failed."); + r = baidu::xpu::api::vsl_rotary_neox_embedding( + xpu_ctx.x_context(), + q_data, + k_data, + rotary_emb.data(), + q_data, + k_data, + lods, + 1, + max_seq_len, + num_head, + dim_head, + "BLHD", + {}, + "NORMAL", + -1); + PD_CHECK(r == 0, "baidu::xpu::api::vsl_rotary_neox_embedding failed."); +} + +template +void BlockMultiheadAttentionXPUKernel( + const Context& dev_ctx, + const DenseTensor& qkv, + const DenseTensor& key_cache, + const DenseTensor& value_cache, + const DenseTensor& seq_lens_encoder, + const DenseTensor& seq_lens_decoder, + const DenseTensor& seq_lens_this_time, + const DenseTensor& padding_offsets, + const DenseTensor& cum_offsets, + const DenseTensor& cu_seqlens_q, + const DenseTensor& cu_seqlens_k, + const DenseTensor& block_tables, + const DenseTensor& cache_k_per_batch_maxs, + const DenseTensor& cache_v_per_batch_maxs, + const paddle::optional& pre_key_cache, + const paddle::optional& pre_value_cache, + const paddle::optional& rope_emb, + const paddle::optional& mask, + const paddle::optional& tgt_mask, + const paddle::optional& cache_k_quant_scales, + const paddle::optional& cache_v_quant_scales, + const paddle::optional& cache_k_dequant_scales, + const paddle::optional& cache_v_dequant_scales, + const paddle::optional& qkv_out_scale, + const paddle::optional& qkv_bias, + const paddle::optional& out_shift, + const paddle::optional& out_smooth, + const paddle::optional& max_enc_len_this_time, + const paddle::optional& max_dec_len_this_time, + int max_seq_len, + int block_size, + bool use_neox_style, + const bool dynamic_cachekv_quant, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + const float 
out_scale,
+    const std::string& compute_dtype,
+    DenseTensor* fmha_out,
+    DenseTensor* qkv_out,
+    DenseTensor* key_cache_out,
+    DenseTensor* value_cache_out) {
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  auto xpu_context = dev_ctx.x_context();
+
+  using XPUType = typename XPUTypeTrait::Type;
+
+  phi::DenseTensor qkv_buf;
+  phi::DenseTensor fmha_buf;
+  VLOG(3) << "fmha_out " << fmha_out->dims();
+  if (out_scale <= 0) {
+    dev_ctx.template Alloc(fmha_out);
+    fmha_buf = *fmha_out;
+  } else {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("out_scale > 0 is not supported now."));
+  }
+  int r = xpu::constant(xpu_context,
+                        reinterpret_cast(fmha_buf.data()),
+                        fmha_buf.numel(),
+                        0);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+  const auto& input_dims = qkv.dims();
+  const auto& key_cache_dims = key_cache.dims();
+  const int token_num = input_dims[0];
+  const int num_head = key_cache_dims[1];
+  const int dim_head = key_cache_dims[3];
+  const int bsz = cum_offsets.dims()[0];
+  const int max_block_per_seq = block_tables.dims()[1];
+  VLOG(3) << "bsz: " << bsz << " token_num: " << token_num
+          << " num_head: " << num_head << " dim_head: " << dim_head
+          << " max_block_per_seq: " << max_block_per_seq;
+  VLOG(3) << "fmha_out_dims: " << fmha_out->dims();
+  bool causal = true;
+  if (mask) {
+    causal = false;
+  }
+  bool use_pre_cache = false;
+  int pre_cache_length = 0;
+  if (pre_key_cache) {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("pre_key_cache is not supported now."));
+  }
+  VLOG(3) << "token_num: " << token_num
+          << " pre_cache_length: " << pre_cache_length;
+
+  int max_dec_len_this_time_data(0);
+  if (!max_dec_len_this_time) {
+    phi::DenseTensor max_dec_len_tensor;
+    max_dec_len_tensor.Resize({{1}});
+    dev_ctx.template Alloc(&max_dec_len_tensor,
+                           max_dec_len_tensor.numel() * sizeof(int));
+    max_dec_len_this_time_data =
+        GetMaxLen(dev_ctx, seq_lens_decoder, &max_dec_len_tensor, bsz);
+  } else {
+    PADDLE_ENFORCE_EQ(
+        max_dec_len_this_time.get().place().GetType(),
+        phi::AllocationType::CPU,
+        errors::InvalidArgument(
+            "The place of input max_dec_len_this_time must be CPU, but got %s.",
+            max_dec_len_this_time.get().place()));
+    max_dec_len_this_time_data = *max_dec_len_this_time.get().data();
+  }
+  int max_enc_len_this_time_data(0);
+  if (!max_enc_len_this_time) {
+    phi::DenseTensor max_enc_len_tensor;
+    max_enc_len_tensor.Resize({{1}});
+    dev_ctx.template Alloc(&max_enc_len_tensor,
+                           max_enc_len_tensor.numel() * sizeof(int));
+    max_enc_len_this_time_data =
+        GetMaxLen(dev_ctx, seq_lens_encoder, &max_enc_len_tensor, bsz);
+  } else {
+    PADDLE_ENFORCE_EQ(
+        max_enc_len_this_time.get().place().GetType(),
+        phi::AllocationType::CPU,
+        errors::InvalidArgument(
+            "The place of input max_enc_len_this_time must be CPU, but got %s.",
+            max_enc_len_this_time.get().place()));
+    max_enc_len_this_time_data = *max_enc_len_this_time.get().data();
+  }
+
+  const int MAXPTR_N = xpu_context->max_ptr_size();
+  VLOG(3) << "max_len end";
+  phi::DenseTensor unpadding_q, unpadding_k, unpadding_v;
+  phi::DenseTensor softmax_out, softmax_lse, seed_offset;
+  phi::DenseTensor q_trans, k_trans, v_trans, qktv_out;
+  if (!use_pre_cache) {
+    unpadding_q.Resize({{token_num, num_head, dim_head}});
+    unpadding_k.Resize({{token_num, num_head, dim_head}});
+    unpadding_v.Resize({{token_num, num_head, dim_head}});
+
+    dev_ctx.template Alloc(&unpadding_q, unpadding_q.numel() * sizeof(T));
+    dev_ctx.template Alloc(&unpadding_k, unpadding_k.numel() * sizeof(T));
+    dev_ctx.template Alloc(&unpadding_v, unpadding_v.numel() * sizeof(T));
+  } else {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("pre_key_cache is not supported now."));
+  }
+  VLOG(3) << "encoder";
+  VLOG(3) << "max_enc_len_this_time_data: " << max_enc_len_this_time_data;
+  if (qkv_out_scale) {
+    PADDLE_THROW(
+        phi::errors::Unimplemented("qkv_out_scale is not supported now."));
+  } else {
+    VLOG(1) << "qkv_out_scale is none";
+    qkv_buf = qkv;
+  }
+  if (qkv_bias) {
+    PADDLE_THROW(phi::errors::Unimplemented("qkv_bias is not supported now."));
+  }
+  std::vector lods_cpu(bsz + 1, 0);
+  xpu_wait(xpu_context->xpu_stream);
+  xpu_memcpy(lods_cpu.data() + 1,
+             seq_lens_this_time.data(),
+             sizeof(int32_t) * bsz,
+             XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  for (int i = 1; i < bsz + 1; i++) {
+    lods_cpu[i] += lods_cpu[i - 1];
+  }
+  using XPUType = typename XPUTypeTrait::Type;
+  baidu::xpu::api::VectorParam lods =
+      baidu::xpu::api::VectorParam{lods_cpu.data(), bsz + 1, nullptr}
+          .to_xpu(RAII_GUARD);
+  float* p_batch_max_ptrs = RAII_GUARD.alloc_l3_or_gm(bsz);
+
+  if (!rope_emb || !use_neox_style) {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "only use_neox_style rope_emb is supported now."));
+  }
+  if (max_enc_len_this_time_data > 0) {
+    // const int* sequence_lengths_data = seq_lens_encoder.data();
+    qkv_split_rope_kernel(dev_ctx,
+                          qkv,
+                          rope_emb.get(),
+                          seq_lens_encoder,
+                          lods,
+                          bsz,
+                          rope_emb.get().dims()[2],
+                          token_num,
+                          num_head,
+                          dim_head,
+                          &unpadding_q,
+                          &unpadding_k,
+                          &unpadding_v);
+
+    VLOG(3) << "rope end";
+    VLOG(3) << "causal: " << causal;
+    if (!use_pre_cache) {
+      phi::FlashAttnUnpaddedKernel(dev_ctx,
+                                   unpadding_q,
+                                   unpadding_k,
+                                   unpadding_v,
+                                   cu_seqlens_q,
+                                   cu_seqlens_k,
+                                   paddle::none /*fixed_seed_offset*/,
+                                   causal ? paddle::none : mask,
+                                   max_enc_len_this_time_data,
+                                   max_enc_len_this_time_data,
+                                   1.0f / sqrt(static_cast(dim_head)),
+                                   0.0,
+                                   causal,
+                                   false,
+                                   true /* is_test*/,
+                                   "" /*rng_name*/,
+                                   &fmha_buf,
+                                   &softmax_out,
+                                   &softmax_lse,
+                                   &seed_offset);
+    } else {
+      PADDLE_THROW(
+          phi::errors::Unimplemented("use_pre_cache is not supported now."));
+    }
+    VLOG(3) << "flash end";
+    if (cache_k_quant_scales && dynamic_cachekv_quant) {
+      PADDLE_THROW(phi::errors::Unimplemented("quant is not supported now."));
+    } else {
+      std::vector start_token_ctx(bsz, 0);
+      xpu::VectorParam start_token_ctx_VP =
+          xpu::VectorParam{
+              start_token_ctx.data(),
+              static_cast(start_token_ctx.size()),
+              nullptr}
+              .to_xpu(RAII_GUARD);
+
+      std::vector ordered_index_ctx(bsz, 0);
+      std::iota(ordered_index_ctx.begin(), ordered_index_ctx.end(), 0);
+      xpu::VectorParam ordered_index_ctx_VP =
+          xpu::VectorParam{
+              ordered_index_ctx.data(), static_cast(bsz), nullptr}
+              .to_xpu(RAII_GUARD);
+      int ret = xpu::reshape_cached_kv(
+          xpu_context,
+          reinterpret_cast(unpadding_k.data()),
+          reinterpret_cast(const_cast(key_cache.data())),
+          block_tables.data(),
+          lods,
+          start_token_ctx_VP,
+          ordered_index_ctx_VP,
+          bsz,
+          num_head,
+          dim_head,
+          bsz,
+          block_size,
+          max_block_per_seq,
+          "BLHD",
+          "HLD");
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+      ret = xpu::batch_findmax(
+          xpu_context,
+          reinterpret_cast(const_cast(key_cache.data())),
+          token_num,
+          num_head * dim_head,
+          bsz,
+          lods.xpu,
+          p_batch_max_ptrs);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+      ret = xpu::copy2d(
+          xpu_context,
+          p_batch_max_ptrs,
+          const_cast(cache_k_per_batch_maxs.data()),
+          bsz,
+          1,
+          MAXPTR_N,
+          1);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+      ret = xpu::reshape_cached_kv(
+          xpu_context,
+          reinterpret_cast(unpadding_v.data()),
+          reinterpret_cast(const_cast(value_cache.data())),
+          block_tables.data(),
+          lods,
+          start_token_ctx_VP,
+          ordered_index_ctx_VP,
+          bsz,
+          num_head,
+          dim_head,
+          bsz,
+          block_size,
+          max_block_per_seq,
+          "BLHD",
+          "HLD");
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+      ret = xpu::batch_findmax(
+          xpu_context,
+          reinterpret_cast(const_cast(value_cache.data())),
+          token_num,
+          num_head * dim_head,
+          bsz,
+          lods.xpu,
+          p_batch_max_ptrs);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+      ret = xpu::copy2d(
+          xpu_context,
+          p_batch_max_ptrs,
+          const_cast(cache_v_per_batch_maxs.data()),
+          bsz,
+          1,
+          MAXPTR_N,
+          1);
+      PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    }
+    VLOG(3) << "cache end";
+  }
+  VLOG(3) << "encoder done";
+  VLOG(3) << "max_dec_len_this_time_data: " << max_dec_len_this_time_data;
+
+  if (max_dec_len_this_time_data > 0) {
+    int cachekv_quant_mode = 0;
+    if (cache_k_quant_scales || cachekv_quant_mode) {
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "cache_k_quant_scales or cachekv_quant_mode are not supported now."));
+    }
+
+    qkv_split_rope_kernel(dev_ctx,
+                          qkv,
+                          rope_emb.get(),
+                          seq_lens_encoder,
+                          lods,
+                          bsz,
+                          rope_emb.get().dims()[2],
+                          token_num,
+                          num_head,
+                          dim_head,
+                          &unpadding_q,
+                          &unpadding_k,
+                          &unpadding_v);
+
+    std::vector kv_seq_lod_dec(bsz + 1, 0);
+    std::iota(kv_seq_lod_dec.begin(), kv_seq_lod_dec.end(), 0);
+    xpu::VectorParam kv_seq_lod_dec_VP =
+        xpu::VectorParam{kv_seq_lod_dec.data(),
+                         static_cast(kv_seq_lod_dec.size()),
+                         nullptr}
+            .to_xpu(RAII_GUARD);
+    std::vector start_token_ctx(bsz, 0);
+    for (int i = 0; i < bsz; i++) {
+      start_token_ctx[i] = lods_cpu[i + 1] - lods_cpu[i];
+    }
+    xpu::VectorParam start_token_ctx_VP =
+        xpu::VectorParam{start_token_ctx.data(),
+                         static_cast(start_token_ctx.size()),
+                         nullptr}
+            .to_xpu(RAII_GUARD);
+
+    std::vector ordered_index_ctx(bsz, 0);
+    std::iota(ordered_index_ctx.begin(), ordered_index_ctx.end(), 0);
+    xpu::VectorParam ordered_index_ctx_VP =
+        xpu::VectorParam{
+            ordered_index_ctx.data(), static_cast(bsz), nullptr}
+            .to_xpu(RAII_GUARD);
+
+    float* p_batch_max_ptrs_fill =
+        RAII_GUARD.alloc_l3_or_gm(bsz * MAXPTR_N);
+    int ret = xpu::constant(
+        xpu_context, p_batch_max_ptrs_fill, bsz * MAXPTR_N, 0.0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "constant");
+    float* p_cache_k_max_data = RAII_GUARD.alloc_l3_or_gm(MAXPTR_N);
+    float* p_cache_v_max_data = RAII_GUARD.alloc_l3_or_gm(MAXPTR_N);
+    ret = xpu::reshape_cached_kv(
+        xpu_context,
+        reinterpret_cast(unpadding_k.data()),
+        reinterpret_cast(const_cast(key_cache.data())),
+        block_tables.data(),
+        kv_seq_lod_dec_VP,
+        start_token_ctx_VP,
+        ordered_index_ctx_VP,
+        bsz,
+        num_head,
+        dim_head,
+        bsz,
+        block_size,
+        max_block_per_seq,
+        "BLHD",
+        "HLD");
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+    ret = xpu::batch_findmax(
+        xpu_context,
+        reinterpret_cast(unpadding_k.data()),
+        bsz,
+        num_head * dim_head,
+        p_batch_max_ptrs);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+    unpadding_k.clear();
+    ret = xpu::copy2d(xpu_context,
+                      p_batch_max_ptrs,
+                      p_batch_max_ptrs_fill,
+                      bsz,
+                      1,
+                      MAXPTR_N,
+                      1);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    ret = xpu::max(
+        xpu_context,
+        cache_k_per_batch_maxs.data(),
+        p_batch_max_ptrs_fill,
+        const_cast(cache_k_per_batch_maxs.data()),
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "max");
+    ret = xpu::findmax(
+        xpu_context,
+        const_cast(cache_k_per_batch_maxs.data()),
+        p_cache_k_max_data,
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "findmax");
+    ret = xpu::reshape_cached_kv(
+        xpu_context,
+        reinterpret_cast(unpadding_v.data()),
+        reinterpret_cast(const_cast(value_cache.data())),
+        block_tables.data(),
+        kv_seq_lod_dec_VP,
+        start_token_ctx_VP,
+        ordered_index_ctx_VP,
+        bsz,
+        num_head,
+        dim_head,
+        bsz,
+        block_size,
+        max_block_per_seq,
+        "BLHD",
+        "HLD");
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reshape_cached_kv");
+    ret = xpu::batch_findmax(
+        xpu_context,
+        reinterpret_cast(unpadding_v.data()),
+        bsz,
+        num_head * dim_head,
+        p_batch_max_ptrs);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "batch_findmax");
+    unpadding_v.clear();
+    ret = xpu::copy2d(xpu_context,
+                      p_batch_max_ptrs,
+                      p_batch_max_ptrs_fill,
+                      bsz,
+                      1,
+                      MAXPTR_N,
+                      1);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy2d");
+    ret = xpu::max(
+        xpu_context,
+        cache_v_per_batch_maxs.data(),
+        p_batch_max_ptrs_fill,
+        const_cast(cache_v_per_batch_maxs.data()),
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "max");
+    ret = xpu::findmax(
+        xpu_context,
+        const_cast(cache_v_per_batch_maxs.data()),
+        p_cache_v_max_data,
+        bsz * MAXPTR_N);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "findmax");
+
+    VLOG(1) << "cachekv_quant_mode " << cachekv_quant_mode;
+    std::vector lods_decoder_cpu(bsz + 1, 0);
+    xpu_wait(xpu_context->xpu_stream);
+    xpu_memcpy(lods_decoder_cpu.data() + 1,
+               seq_lens_decoder.data(),
+               sizeof(int32_t) * bsz,
+               XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+    for (int i = 1; i < bsz + 1; i++) {
+      lods_decoder_cpu[i] += lods_decoder_cpu[i - 1];
+    }
+    std::vector qkvlod_dec(2 * (bsz + 1), 0);
+    for (int bs = 0; bs < bsz; bs++) {
+      qkvlod_dec[bs + 1] = bs + 1;
+      qkvlod_dec[bsz + 1 + bs + 1] = lods_decoder_cpu[bs + 1] + 1;
+    }
+    auto qkvlod_dec_vp =
+        xpu::VectorParam{
+            qkvlod_dec.data(), static_cast(qkvlod_dec.size()), nullptr}
+            .to_xpu(RAII_GUARD);
+    xpu::DecodeAttnParam decoder_attn_vsl_param(
+        qkvlod_dec_vp, max_seq_len, num_head, dim_head, -1, 0, bsz, {});
+    xpu::PageAttnParam page_param(
+        block_size, bsz, max_block_per_seq, ordered_index_ctx_VP, 0, "HLD");
+    float* max_q_ptr = RAII_GUARD.alloc_l3_or_gm(MAXPTR_N);
+    ret = xpu::findmax(xpu_context,
+                       reinterpret_cast(unpadding_q.data()),
+                       max_q_ptr,
+                       token_num * num_head * dim_head);
+
+    ret = xpu::qkv_paged_attention(
+        xpu_context,
+        reinterpret_cast(unpadding_q.data()),
+        reinterpret_cast(const_cast(key_cache.data())),
+        reinterpret_cast(const_cast(value_cache.data())),
+        block_tables.data(),  // [pagep.max_batch_size,
+                              //  pagep.max_num_blocks_per_seq]
+        reinterpret_cast(fmha_buf.data()),
+        max_q_ptr,
+        p_cache_k_max_data,  // shape=[6], nullptr if pagep.quant_type == 1
+        p_cache_v_max_data,  // shape=[6], nullptr if pagep.quant_type == 1
+        nullptr,
+        decoder_attn_vsl_param,  // attention-related parameters
+        page_param);             // page-attention-related parameters
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "qkv_paged_attention");
+  }
+  VLOG(3) << "decoder done";
+}
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(block_multihead_attention_xpu,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::fusion::BlockMultiheadAttentionXPUKernel,
+                   phi::dtype::float16) {
+  kernel->InputAt(26).SetBackend(phi::Backend::CPU);
+  kernel->InputAt(27).SetBackend(phi::Backend::CPU);
+}
diff --git a/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
index 833caa6688787..cac0182feaa2b 100644
--- a/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
+++ b/paddle/phi/kernels/fusion/xpu/fused_layernorm_kernel.cc
@@ -63,6 +63,11 @@ void FusedLayerNormKernel(const Context& dev_ctx,
   dev_ctx.template Alloc(&residual_alpha_tmp);
   dev_ctx.template Alloc(&residual_alpha_ptr);

+  r = baidu::xpu::api::constant(xpu_ctx->x_context(),
+
reinterpret_cast(out->data()), + out->numel(), + static_cast(0.f)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); r = baidu::xpu::api::constant(xpu_ctx->x_context(), residual_alpha_tmp.data(), diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index 4835b643efcc7..720447ea41a0e 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -123,6 +123,17 @@ inline __device__ void UpdateSum(const T* srcX, float* sum, float* sumSq) { *sumSq += src_data * src_data; } +template +inline __device__ void UpdateSum(const T* srcX, + const T* srcR, + float* sum, + float* sumSq) { + float src_data = phi::__2float(*srcX); + float srcy_data = phi::__2float(*srcR); + *sum += src_data + srcy_data; + *sumSq += (src_data + srcy_data) * (src_data + srcy_data); +} + template <> inline __device__ void UpdateSum<__half, 2>(const __half* srcX, float* sum, @@ -133,6 +144,20 @@ inline __device__ void UpdateSum<__half, 2>(const __half* srcX, *sumSq += f2.x * f2.x + f2.y * f2.y; } +template <> +inline __device__ void UpdateSum<__half, 2>(const __half* srcX, + const __half* srcR, + float* sum, + float* sumSq) { + __half2 h2 = *reinterpret_cast<__half2 const*>(srcX); + __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR); + float2 f2 = __half22float2(h2); + float2 f2_r = __half22float2(h2_r); + *sum += f2.x + f2_r.x + f2.y + f2_r.y; + *sumSq += + (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y); +} + template <> inline __device__ void UpdateSum( const phi::dtype::float16* srcX, float* sum, float* sumSq) { @@ -142,6 +167,21 @@ inline __device__ void UpdateSum( *sumSq += f2.x * f2.x + f2.y * f2.y; } +template <> +inline __device__ void UpdateSum( + const phi::dtype::float16* srcX, + const phi::dtype::float16* srcR, + float* sum, + float* sumSq) { + __half2 h2 = *reinterpret_cast<__half2 const*>(srcX); + __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR); + float2 f2 = __half22float2(h2); + float2 f2_r = __half22float2(h2_r); + *sum += f2.x + f2_r.x + f2.y + f2_r.y; + *sumSq += + (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y); +} + #ifdef PADDLE_CUDA_BF16 template <> inline __device__ void UpdateSum( @@ -151,6 +191,21 @@ inline __device__ void UpdateSum( *sum += f2.x + f2.y; *sumSq += f2.x * f2.x + f2.y * f2.y; } + +template <> +inline __device__ void UpdateSum( + const phi::dtype::bfloat16* srcX, + const phi::dtype::bfloat16* srcR, + float* sum, + float* sumSq) { + __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX); + __nv_bfloat162 h2_r = *reinterpret_cast<__nv_bfloat162 const*>(srcR); + float2 f2 = phi::bfloat1622float2(h2); + float2 f2_r = phi::bfloat1622float2(h2_r); + *sum += f2.x + f2_r.x + f2.y + f2_r.y; + *sumSq += + (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y); +} #endif template @@ -177,7 +232,13 @@ __global__ void groupNormNDHWCSumSingerChannelKernel( int64_t offset = static_cast(ni) * params.dhwc + static_cast(dhwi) * params.c + ci; float src_data = *reinterpret_cast(¶ms.srcX[offset]); - UpdateSum(¶ms.srcX[offset], &sum, &sumSq); + if (params.srcR != nullptr) { + int64_t g_offset = params.y_same_with_x ? 
offset : ci; + UpdateSum( + ¶ms.srcX[offset], ¶ms.srcR[g_offset], &sum, &sumSq); + } else { + UpdateSum(¶ms.srcX[offset], &sum, &sumSq); + } } smem[threadIdx.x] = make_float2(sum, sumSq); @@ -185,7 +246,6 @@ __global__ void groupNormNDHWCSumSingerChannelKernel( __syncthreads(); float2 sums = smem[threadIdx.x]; - atomicAdd(¶ms.redBuffer[(2 * ni + 0) * params.groups + ci], sums.x * params.invDHWC); atomicAdd(¶ms.redBuffer[(2 * ni + 1) * params.groups + ci], sums.y); @@ -209,7 +269,8 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams params) { if (ci >= params.c || threadIdx.x * THREADS_PER_CHANNEL >= params.cPerBlock) { return; } - // The first activation loaded by that block. + int32_t gj = ci / params.cPerGroup; + int32_t cj = ci % params.cPerGroup; int32_t dhwBegin = blockIdx.y * params.dhwPerBlock; // The last activation loaded by that block. int32_t dhwEnd = min(dhwBegin + params.dhwPerBlock, params.dhw); @@ -223,13 +284,19 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams params) { int64_t offset = static_cast(ni) * params.dhwc + static_cast(dhwi) * params.c + ci; float src_data = *reinterpret_cast(¶ms.srcX[offset]); - UpdateSum(¶ms.srcX[offset], &sum, &sumSq); + if (params.srcR != nullptr) { + int64_t g_offset = + params.y_same_with_x ? offset : gj * params.cPerGroup + cj; + UpdateSum( + ¶ms.srcX[offset], ¶ms.srcR[g_offset], &sum, &sumSq); + } else { + UpdateSum(¶ms.srcX[offset], &sum, &sumSq); + } } // The group that thread works on and the channel in the group (modulus). int32_t gi = ci / params.cPerGroup - blockIdx.x * params.cPerBlock / params.cPerGroup; - int32_t cj = ci % params.cPerGroup; int flag = (cj == 0 || threadIdx.x == 0) ? 1 : 0; GroupSums inp{flag, sum, sumSq}; GroupSums out; @@ -243,7 +310,6 @@ __global__ void groupNormNDHWCSumKernel(const GroupNormNDHWCParams params) { __syncthreads(); - int32_t gj = ci / params.cPerGroup; if (cj == params.cPerGroup - THREADS_PER_CHANNEL || threadIdx.x * THREADS_PER_CHANNEL == params.cPerBlock - THREADS_PER_CHANNEL) { @@ -351,7 +417,15 @@ inline __device__ void GroupNormCompute(int32_t dhwBegin, for (int32_t dhwi = dhwBegin; dhwi < dhwEnd; ++dhwi) { // The src/dst offset. int64_t offset = (int64_t)blockIdx.z * params.dhwc + dhwi * params.c + ci; - const float src_data = phi::__2float(params.srcX[offset]); + float src_data = phi::__2float(params.srcX[offset]); + if (params.srcR != nullptr) { + auto gi = ci / params.cPerGroup; + auto gj = ci % params.cPerGroup; + int64_t g_offset = + params.y_same_with_x ? offset : gi * params.cPerGroup + gj; + src_data += phi::__2float(params.srcR[g_offset]); + *reinterpret_cast(¶ms.eleOut[offset]) = phi::__2dst(src_data); + } // Normalize the channels. float dst_data = (src_data - mean) * invStdDev; // Scale by gamma and add beta. @@ -392,6 +466,18 @@ inline __device__ void GroupNormCompute( // Extract the two half values. float2 f2 = __half22float2(h2); + if (params.srcR != nullptr) { + auto gi = ci / params.cPerGroup; + auto gj = ci % params.cPerGroup; + int64_t g_offset = + params.y_same_with_x ? offset : gi * params.cPerGroup + gj; + __half2 r2 = *reinterpret_cast<__half2 const*>(¶ms.srcR[g_offset]); + float2 r_f2 = __half22float2(r2); + f2.x += r_f2.x; + f2.y += r_f2.y; + *reinterpret_cast<__half2*>(¶ms.eleOut[offset]) = + __float22half2_rn(f2); + } // Normalize the channels. f2.x = (f2.x - mean) * invStdDev; f2.y = (f2.y - mean) * invStdDev; @@ -434,7 +520,18 @@ inline __device__ void GroupNormCompute<__half, 2>( // Extract the two half values. 
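// Residual fusion in the vectorized __half2 path below: when srcR is set,
// the matching residual pair is loaded as __half2, added into f2 before
// normalization, and the pre-normalization sum is stored to eleOut.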
float2 f2 = __half22float2(h2); - + if (params.srcR != nullptr) { + auto gi = ci / params.cPerGroup; + auto gj = ci % params.cPerGroup; + int64_t g_offset = + params.y_same_with_x ? offset : gi * params.cPerGroup + gj; + __half2 r2 = *reinterpret_cast<__half2 const*>(&params.srcR[g_offset]); + float2 r_f2 = __half22float2(r2); + f2.x += r_f2.x; + f2.y += r_f2.y; + *reinterpret_cast<__half2*>(&params.eleOut[offset]) = + __float22half2_rn(f2); + } // Normalize the channels. f2.x = (f2.x - mean) * invStdDev; f2.y = (f2.y - mean) * invStdDev; @@ -480,6 +577,19 @@ inline __device__ void GroupNormCompute( // Extract the two half values. float2 f2 = phi::bfloat1622float2(h2); + if (params.srcR != nullptr) { + auto gi = ci / params.cPerGroup; + auto gj = ci % params.cPerGroup; + int64_t g_offset = + params.y_same_with_x ? offset : gi * params.cPerGroup + gj; + __nv_bfloat162 r2 = + *reinterpret_cast<__nv_bfloat162 const*>(&params.srcR[g_offset]); + float2 r_f2 = phi::bfloat1622float2(r2); + f2.x += r_f2.x; + f2.y += r_f2.y; + *reinterpret_cast<__nv_bfloat162*>(&params.eleOut[offset]) = + phi::float22bfloat162_rn(f2); + } // Normalize the channels. f2.x = (f2.x - mean) * invStdDev; f2.y = (f2.y - mean) * invStdDev; @@ -511,6 +621,7 @@ __global__ void groupNormNDHWCScaleKernel( // The group that thread works on and the channel in the group (modulus). int32_t gi = ci / params.cPerGroup; + int32_t gj = ci % params.cPerGroup; if (ci >= params.c || gi >= params.groups) { return; } @@ -597,17 +708,24 @@ template class groupNormNDHWCScale; template void GroupNormNDHWCKernel(const Context& dev_ctx, const DenseTensor& x, + const paddle::optional& residual, const paddle::optional& scale, const paddle::optional& bias, float epsilon, int groups, const std::string& data_layout_str, + const std::string& activation, DenseTensor* y, + DenseTensor* residual_out, DenseTensor* mean, DenseTensor* var) { + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); + if (data_layout != DataLayout::kNHWC) { + PD_THROW("data_layout only supports NHWC and NDHWC"); + } using AccT = typename phi::dtype::MPTypeTrait::Type; GroupNormNDHWCParams params_; - params_.withSilu = false; + params_.withSilu = activation == "silu" ? true : false; const auto x_dims = x.dims(); dev_ctx.template Alloc(y); @@ -639,6 +757,23 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, params_.w = x_dims[3]; } + const T* residual_data = nullptr; + const auto residual_ptr = residual.get_ptr(); + T* residual_out_data = nullptr; + if (residual_ptr) { + dev_ctx.template Alloc(residual_out); + residual_data = residual_ptr->data(); + residual_out_data = residual_out->data(); + const auto r_dims = residual_ptr->dims(); + int32_t r_dim = 1; + for (size_t i = 0; i < r_dims.size(); i++) { + r_dim *= r_dims[i]; + } + params_.y_same_with_x = + r_dim == params_.n * params_.c * params_.d * params_.h * params_.w + ?
true + : false; + } dev_ctx.template Alloc(mean); dev_ctx.template Alloc(var); auto* mean_data = mean->data(); @@ -673,7 +808,10 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, } params_.srcX = reinterpret_cast(x_data); params_.dst = reinterpret_cast(y_data); - + if (residual_ptr) { + params_.srcR = reinterpret_cast(residual_data); + params_.eleOut = reinterpret_cast(residual_out_data); + } params_.gamma = scale_data; params_.beta = bias_data; params_.dhw = params_.d * params_.h * params_.w; @@ -1027,14 +1165,19 @@ void GroupNormKernel(const Context& dev_ctx, DenseTensor* var) { using std::is_same; if (is_same::value && data_layout_str == "NHWC") { + const paddle::optional& residual = + paddle::optional(paddle::none); GroupNormNDHWCKernel(dev_ctx, x, + residual, scale, bias, epsilon, groups, data_layout_str, + "", y, + new DenseTensor(), mean, var); return; @@ -1042,14 +1185,19 @@ void GroupNormKernel(const Context& dev_ctx, #ifdef PADDLE_CUDA_BF16 if (is_same::value && data_layout_str == "NHWC") { + const paddle::optional& residual = + paddle::optional(paddle::none); GroupNormNDHWCKernel(dev_ctx, x, + residual, scale, bias, epsilon, groups, data_layout_str, + "", y, + new DenseTensor(), mean, var); return; @@ -1076,3 +1224,13 @@ PD_REGISTER_KERNEL(group_norm, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); } } + +PD_REGISTER_KERNEL(add_group_norm_silu, + GPU, + ALL_LAYOUT, + phi::GroupNormNDHWCKernel, + phi::dtype::bfloat16, + phi::dtype::float16) { + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/gpu/inverse_grad_kernel.cu b/paddle/phi/kernels/gpu/inverse_grad_kernel.cu index 2fdc02934fedc..15c24719adfc3 100644 --- a/paddle/phi/kernels/gpu/inverse_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/inverse_grad_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/inverse_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - inverse_grad, GPU, ALL_LAYOUT, phi::InverseGradKernel, float, double) {} +PD_REGISTER_KERNEL(inverse_grad, + GPU, + ALL_LAYOUT, + phi::InverseGradKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/inverse_kernel.cu b/paddle/phi/kernels/gpu/inverse_kernel.cu index 4c011337c6f8f..a9b4fcc763b0b 100644 --- a/paddle/phi/kernels/gpu/inverse_kernel.cu +++ b/paddle/phi/kernels/gpu/inverse_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/inverse_kernel_impl.h" -PD_REGISTER_KERNEL( - inverse, GPU, ALL_LAYOUT, phi::InverseKernel, float, double) {} +PD_REGISTER_KERNEL(inverse, + GPU, + ALL_LAYOUT, + phi::InverseKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/lstm_grad_kernel.cu b/paddle/phi/kernels/gpu/lstm_grad_kernel.cu new file mode 100644 index 0000000000000..5590541dcb385 --- /dev/null +++ b/paddle/phi/kernels/gpu/lstm_grad_kernel.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" +#include "paddle/phi/kernels/lstm_kernel.h" + +PD_REGISTER_KERNEL( + lstm_grad, GPU, ALL_LAYOUT, phi::LSTMGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lstm_kernel.cu b/paddle/phi/kernels/gpu/lstm_kernel.cu new file mode 100644 index 0000000000000..7bcf1f78ab604 --- /dev/null +++ b/paddle/phi/kernels/gpu/lstm_kernel.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lstm_kernel.h" +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" + +PD_REGISTER_KERNEL(lstm, GPU, ALL_LAYOUT, phi::LSTMKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc index 2dd9e7dc6ceec..3244f28c77700 100644 --- a/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/meshgrid_grad_kernel.cu.cc @@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(meshgrid_grad, double, int, int64_t, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc index 5a1c74f4193d3..9176305d94fec 100644 --- a/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc @@ -27,4 +27,6 @@ PD_REGISTER_KERNEL(meshgrid, double, int, int64_t, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/group_norm_kernel.h b/paddle/phi/kernels/group_norm_kernel.h index 3dc10df6a1109..7f4b83f065bde 100644 --- a/paddle/phi/kernels/group_norm_kernel.h +++ b/paddle/phi/kernels/group_norm_kernel.h @@ -67,6 +67,8 @@ struct GroupNormNDHWCParams { T const* srcX; // The input buffer. Layout NDHWC. T const* srcY; + // The residual input buffer to add to srcX. Layout NDHWC. + T const* srcR = nullptr; // The gamma scaling factor. void const* gamma; // The beta term to add in GN. @@ -87,7 +89,8 @@ struct GroupNormNDHWCParams { int32_t groups; // Do we apply the Silu activation function? bool withSilu; - + // Whether srcR has the same full NDHWC shape as srcX (otherwise it is + // broadcast per channel). + bool y_same_with_x = false; // Precomputed values and parameters to control the execution of the kernels.
// The number of activations per instance (d * h * w) and the number of diff --git a/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h b/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h index 26e2898bf73ff..aa23bddb5b979 100644 --- a/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/inverse_grad_kernel_impl.h @@ -18,6 +18,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" @@ -37,15 +38,35 @@ void InverseGradKernel(const Context& dev_ctx, tmp_out.Resize(out.dims()); dev_ctx.template Alloc(&tmp_out); - auto mat_dim_a0 = - phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false); - auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); - blas.MatMul(out_grad, mat_dim_a0, out, mat_dim_b0, T(1), &tmp_out, T(0)); + if (IsComplexType(out.dtype())) { + DenseTensor out_conj; + out_conj.Resize(out.dims()); + dev_ctx.template Alloc(&out_conj); - auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false); - blas.MatMul(out, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0)); + phi::ConjKernel(dev_ctx, out, &out_conj); + + auto mat_dim_a0 = + phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false); + auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + blas.MatMul( + out_grad, mat_dim_a0, out_conj, mat_dim_b0, T(1), &tmp_out, T(0)); + + auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + auto mat_dim_b1 = + phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false); + blas.MatMul( + out_conj, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0)); + } else { + auto mat_dim_a0 = + phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false); + auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + blas.MatMul(out_grad, mat_dim_a0, out, mat_dim_b0, T(1), &tmp_out, T(0)); + + auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + auto mat_dim_b1 = + phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false); + blas.MatMul(out, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0)); + } } } diff --git a/paddle/phi/kernels/impl/lstm_kernel_impl.h b/paddle/phi/kernels/impl/lstm_kernel_impl.h new file mode 100644 index 0000000000000..1f4b4dcac0f14 --- /dev/null +++ b/paddle/phi/kernels/impl/lstm_kernel_impl.h @@ -0,0 +1,443 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
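For reference, the conjugated branch added to inverse_grad_kernel_impl.h above is the standard matrix-inverse gradient. With Y = X^{-1} and G = ∂L/∂Y, the differential d(X^{-1}) = -X^{-1} dX X^{-1} yields, in Wirtinger calculus (covering the complex dtypes),

\[ \frac{\partial L}{\partial X} = -\,Y^{H}\, G\, Y^{H}, \qquad Y^{H} = \overline{Y}^{\mathsf{T}}, \]

which is exactly what the two MatMul calls compute: tmp_out = G * conj(Y)^T followed by in_grad = -conj(Y)^T * tmp_out. For real dtypes conj(Y) = Y, recovering the original Y^T-only path.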
+ +#pragma once +#include + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/lstm_utils.h" + +namespace phi { + +template +void LSTMKernel(const Context& dev_ctx, + const DenseTensor& input, + const paddle::optional& h0, + const paddle::optional& c0, + const DenseTensor& weight, + const DenseTensor& bias, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + DenseTensor* hidden, + DenseTensor* cell, + DenseTensor* batch_gate, + DenseTensor* batch_cell_pre_act) { + auto* hidden_t0 = h0.get_ptr(); + auto* cell_t0 = c0.get_ptr(); + + phi::DenseTensor* batch_gate_new = nullptr; + phi::DenseTensor batch_gate_temp; + if (is_test) { + batch_gate_new = &batch_gate_temp; + batch_gate_new->Resize(input.dims()); + } else { + batch_gate_new = batch_gate; + } + + dev_ctx.template Alloc(batch_gate_new); + dev_ctx.template Alloc(hidden); + dev_ctx.template Alloc(cell); + + phi::funcs::LoDTensor2BatchFunctor to_batch; + to_batch(dev_ctx, input, batch_gate_new, true, is_reverse); + + auto in_dims = input.dims(); + int frame_size = static_cast(in_dims[1] / 4); + phi::DDim dims({in_dims[0], frame_size}); + + if (bias.initialized()) { + phi::DenseTensor b = bias; + b.Resize({bias.numel(), 1}); + phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); + phi::funcs::RowwiseAdd add_bias; + add_bias(dev_ctx, *batch_gate_new, gate_bias, batch_gate_new); + } + + phi::funcs::LstmMetaValue lstm_value; + if (bias.initialized() && use_peepholes) { + T* bias_data = const_cast(bias.data()); + // The code style in LstmMetaValue will be updated later. + + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + lstm_value.prev_state_value = nullptr; + phi::DenseTensor ordered_c0; + + phi::Vector order(batch_gate_new->lod()[2]); + + if (cell_t0) { + // Since batch computing for LSTM reorders the input sequences by + // length, the initial cell state also needs to be reordered. + ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, true); + lstm_value.prev_state_value = ordered_c0.data(); + } + + // Use local variables as the batched buffers here.
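+  // Sketch of what the batched loop below computes for each time-major
+  // batch slice (the op's input already carries the X_t * W_x projections,
+  // which is why in_dims[1] == 4 * frame_size):
+  //   gates_t = batch_gate_t + H_{t-1} * W_h            (the blas.MatMul call)
+  //   C_t     = f_t .* C_{t-1} + i_t .* cand_act(gates_t)   (LstmUnitFunctor)
+  //   H_t     = o_t .* cell_act(C_t)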
+ phi::DenseTensor batch_hidden, batch_cell, batch_cell_pre_act_temp; + phi::DenseTensor* batch_cell_pre_act_p; + if (is_test) { + batch_cell_pre_act_p = &batch_cell_pre_act_temp; + } else { + batch_cell_pre_act_p = batch_cell_pre_act; + } + batch_hidden.Resize(dims); + batch_cell.Resize(dims); + dev_ctx.template Alloc(&batch_hidden); + dev_ctx.template Alloc(&batch_cell); + batch_cell_pre_act_p->Resize(dims); + dev_ctx.template Alloc(batch_cell_pre_act_p); + + auto batch_starts = batch_gate_new->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = phi::funcs::detail::GetActivationType(gate_activation); + auto cell_act = phi::funcs::detail::GetActivationType(cell_activation); + auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation); + + auto blas = phi::funcs::GetBlas(dev_ctx); + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + phi::DenseTensor gate_t = batch_gate_new->Slice(bstart, bend); + phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend); + phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act_t = batch_cell_pre_act_p->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); + blas.MatMul(pre_hidden_t, + false, + weight, + false, + static_cast(1.0), + &gate_t, + static_cast(1.0)); + } else if (hidden_t0 != nullptr) { + // If n == 0 and there is no initial hidden state, i.e. H0 is all + // zeros, the calculation W_h * H0 is skipped. + // If n == 0 and an initial hidden state is given, calculate W_h * H0. + + // Since batch computing for LSTM reorders the input sequences by + // length, the initial hidden state also needs to be reordered.
+ phi::DenseTensor ordered_h0; + ReorderInitState( + dev_ctx, *hidden_t0, order, &ordered_h0, true); + blas.MatMul(ordered_h0, + false, + weight, + false, + static_cast(1.0), + &gate_t, + static_cast(1.0)); + } + + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); + T cell_clip = 0.0; + phi::funcs::LstmUnitFunctor::compute(dev_ctx, + lstm_value, + frame_size, + cur_batch_size, + cell_clip, + gate_act, + cell_act, + cand_act); + lstm_value.prev_state_value = lstm_value.state_value; + } + + phi::funcs::Batch2LoDTensorFunctor to_seq; + batch_hidden.set_lod(batch_gate_new->lod()); + // restore the output hidden in phi::DenseTensor from the batch hidden + to_seq(dev_ctx, batch_hidden, hidden); + + batch_cell.set_lod(batch_gate_new->lod()); + // restore the output cell state in phi::DenseTensor from the batch cell + to_seq(dev_ctx, batch_cell, cell); +} + +template +void LSTMGradKernel(const Context& dev_ctx, + const DenseTensor& input_in, + const paddle::optional& h0_in, + const paddle::optional& c0_in, + const DenseTensor& weight_in, + const DenseTensor& bias_in, + const DenseTensor& hidden_in, + const DenseTensor& cell_in, + const DenseTensor& batch_gate_in, + const DenseTensor& batch_cell_pre_act_in, + const DenseTensor& hidden_grad, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + DenseTensor* input_grad, + DenseTensor* h0_grad, + DenseTensor* c0_grad, + DenseTensor* weight_grad, + DenseTensor* bias_grad) { + auto* input = &input_in; + auto* weight = &weight_in; + auto* bias = &bias_in; + + auto* hidden_out = &hidden_in; + auto* cell_out = &cell_in; + + auto* batch_gate = &batch_gate_in; + auto* batch_cell_pre_act = &batch_cell_pre_act_in; + + auto* hidden_g = &hidden_grad; + + auto* in_g = input_grad; + auto* weight_g = weight_grad; + auto* bias_g = bias_grad; + + auto* h0 = h0_in.get_ptr(); + auto* c0 = c0_in.get_ptr(); + + auto* h0_g = h0_grad; + auto* c0_g = c0_grad; + + phi::funcs::SetConstant zero; + if (weight_g) { + dev_ctx.template Alloc(weight_g); + zero(dev_ctx, weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. + // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. 
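+  // Note: the reordering is undone at the end of this kernel, where
+  // ReorderInitState is called again with its final flag set to false so
+  // that ordered_h0_g / ordered_c0_g are written back to h0_grad / c0_grad
+  // in the caller's original sequence order.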
+ phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + phi::Vector order(batch_gate->lod()[2]); + + if (c0) { + ReorderInitState(dev_ctx, *c0, order, &ordered_c0, true); + } + if (c0 && c0_g) { + ordered_c0_g.Resize(c0_g->dims()); + dev_ctx.template Alloc(&ordered_c0_g); + } + + auto in_dims = input->dims(); + auto out_dims = hidden_g->dims(); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, + out_dims[1], + phi::errors::InvalidArgument( + "The second dimension of Input(hidden_grad) should be " + "%d, but received %d in LSTM@Grad operator.", + frame_size, + out_dims[1])); + + phi::funcs::LstmMetaValue lstm_value; + if (bias && use_peepholes) { + T* bias_data = const_cast(bias->data()); + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; + } else { + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; + } + + phi::funcs::LstmMetaGrad lstm_grad; + + if (bias && bias_g) { + dev_ctx.template Alloc(bias_g); + zero(dev_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && use_peepholes) { + T* bias_g_data = bias_g->data(); + lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; + lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; + } else { + lstm_grad.check_ig_grad = nullptr; + lstm_grad.check_fg_grad = nullptr; + lstm_grad.check_og_grad = nullptr; + } + + phi::funcs::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch](const Context& ctx, + const phi::DenseTensor& src, + const phi::DDim& dims, + phi::DenseTensor& dst) { + dst.Resize(dims); + ctx.template Alloc(&dst); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, &dst, false); + }; + + phi::DenseTensor batch_hidden, batch_hidden_g, batch_cell; + ToBatch(dev_ctx, *hidden_out, out_dims, batch_hidden); + ToBatch(dev_ctx, *hidden_g, out_dims, batch_hidden_g); + ToBatch(dev_ctx, *cell_out, out_dims, batch_cell); + + phi::DenseTensor batch_cell_g, batch_gate_g; + batch_cell_g.Resize(out_dims); + dev_ctx.template Alloc(&batch_cell_g); + // TODO(qingqing) support the case output cell has gradient. 
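+  // Until that TODO is resolved, the gradient flowing into the final cell
+  // state is treated as zero, which is why batch_cell_g is zero-filled
+  // below instead of being batched in from a cell output gradient.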
+ // to_batch(dev_ctx, *cell_g, batch_cell_g, false); + zero(dev_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.Resize(batch_gate->dims()); + dev_ctx.template Alloc(&batch_gate_g); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = phi::funcs::detail::GetActivationType(gate_activation); + auto cell_act = phi::funcs::detail::GetActivationType(cell_activation); + auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto blas = phi::funcs::GetBlas(dev_ctx); + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + phi::DenseTensor gate = batch_gate->Slice(bstart, bend); + phi::DenseTensor cell = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstm_value.gate_value = gate.data(); + lstm_value.state_value = cell.data(); + lstm_value.state_active_value = cell_pre_act.data(); + + phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); + phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); + phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); + lstm_grad.state_grad = cell_g.data(); + lstm_grad.gate_grad = gate_g.data(); + lstm_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstm_value.prev_state_value = cell_pre.data(); + lstm_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstm_grad.prev_state_grad = c0_g ? 
ordered_c0_g.data() : nullptr; + } + + // lstm_value.output_value not used in bp, set to nullptr + // lstm_grad.state_active_grad not used in bp, set to nullptr + lstm_value.output_value = nullptr; + lstm_grad.state_active_grad = nullptr; + int cur_batch_size = bend - bstart; + T cell_clip = 0.0; + phi::funcs::LstmUnitGradFunctor::compute(dev_ctx, + lstm_value, + lstm_grad, + frame_size, + cur_batch_size, + cell_clip, + gate_act, + cell_act, + cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); + blas.MatMul(gate_g, + false, + *weight, + true, + static_cast(1.0), + &pre_hidden_g, + static_cast(1.0)); + if (weight_g) { + /* backward weight */ + auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); + blas.MatMul(pre_hidden, + true, + gate_g, + false, + static_cast(1.0), + weight_g, + static_cast(1.0)); + } + } else { + if (h0 && weight_g) { + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, true); + blas.MatMul(ordered_h0, + true, + gate_g, + false, + static_cast(1.0), + weight_g, + static_cast(1.0)); + } + if (h0 && h0_g) { + ordered_h0_g.Resize(h0_g->dims()); + dev_ctx.template Alloc(&ordered_h0_g); + blas.MatMul(gate_g, + false, + *weight, + true, + static_cast(1.0), + &ordered_h0_g, + static_cast(0.0)); + } + } + } + + phi::funcs::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + dev_ctx.template Alloc(in_g); + to_seq(dev_ctx, batch_gate_g, in_g); + } + if (bias && bias_g) { + /* backward bias */ + phi::DenseTensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + phi::funcs::ColwiseSum col_sum; + col_sum(dev_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(dev_ctx, ordered_h0_g, order, h0_g, false); + } + if (c0 && c0_g) { + ReorderInitState(dev_ctx, ordered_c0_g, order, c0_g, false); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/lstm_kernel.h b/paddle/phi/kernels/lstm_kernel.h new file mode 100644 index 0000000000000..42195e375c3a9 --- /dev/null +++ b/paddle/phi/kernels/lstm_kernel.h @@ -0,0 +1,66 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
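For reference, the BLAS calls in LSTMGradKernel above implement the per-slice BPTT identities (a sketch; G_t denotes the gate pre-activation gradient gate_g produced by LstmUnitGradFunctor):

\[ \frac{\partial L}{\partial H_{t-1}} \mathrel{+}= G_t W_h^{\mathsf{T}}, \qquad \frac{\partial L}{\partial W_h} \mathrel{+}= H_{t-1}^{\mathsf{T}} G_t, \qquad \frac{\partial L}{\partial b} = \sum_t \operatorname{colsum}(G_t), \]

with the first slice (n == 0) substituting the reordered h0 for H_{t-1}. The trailing ColwiseSum over batch_gate_g realizes the bias term, and Batch2LoDTensorFunctor scatters batch_gate_g back to sequence order as input_grad.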
+ +#pragma once +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void LSTMKernel(const Context& dev_ctx, + const DenseTensor& input, + const paddle::optional& h0, + const paddle::optional& c0, + const DenseTensor& weight, + const DenseTensor& bias, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + DenseTensor* hidden, + DenseTensor* cell, + DenseTensor* batch_gate, + DenseTensor* batch_cell_pre_act); + +template +void LSTMGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const paddle::optional& h0, + const paddle::optional& c0, + const DenseTensor& weight, + const DenseTensor& bias, + const DenseTensor& hidden, + const DenseTensor& cell, + const DenseTensor& batch_gate, + const DenseTensor& batch_cell_pre_act, + const DenseTensor& hidden_grad, + bool use_peepholes, + bool is_reverse, + bool is_test, + const std::string& gate_activation, + const std::string& cell_activation, + const std::string& candidate_activation, + DenseTensor* input_grad, + DenseTensor* h0_grad, + DenseTensor* c0_grad, + DenseTensor* weight_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_kernel_impl.cc b/paddle/phi/kernels/reduce_kernel_impl.cc index 000cb99034c26..9319248099903 100644 --- a/paddle/phi/kernels/reduce_kernel_impl.cc +++ b/paddle/phi/kernels/reduce_kernel_impl.cc @@ -20,10 +20,16 @@ namespace phi { // oneDNN's reduction kernel is optimized only for reducing throughout the // most outer dims, so in case of another type of reduction, it would be // better to fallback to native implementation -inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx) { +inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx, + const bool mean_op) { const DenseTensor& x = ctx->InputAt(0); - const TensorRef& dims_tmp = ctx->AttrAt(0); - IntArray dims_array = IntArray(*dims_tmp.Get()); + IntArray dims_array; + if (mean_op) { + dims_array = ctx->AttrAt(0); + } else { + const TensorRef& dims_tmp = ctx->AttrAt(0); + dims_array = IntArray(*dims_tmp.Get()); + } int ndims = x.dims().size(); const bool reduce_all = recompute_reduce_all(x, dims_array); auto dims = dims_array.GetData(); @@ -53,7 +59,15 @@ inline bool HasOptimizedOneDNNKernel(const KernelContext* ctx) { bool ReduceCheckIfOneDNNSupport(const KernelContext* ctx) { if (ctx->InputAt(0).dims().size() > 5 || - !HasOptimizedOneDNNKernel(ctx)) { + !HasOptimizedOneDNNKernel(ctx, false)) { + return false; + } + return true; +} + +bool ReduceMeanCheckIfOneDNNSupport(const KernelContext* ctx) { + if (ctx->InputAt(0).dims().size() > 5 || + !HasOptimizedOneDNNKernel(ctx, true)) { return false; } return true; diff --git a/paddle/phi/kernels/reduce_kernel_impl.h b/paddle/phi/kernels/reduce_kernel_impl.h index aef4f57ddbdcf..e117f6ab335dd 100644 --- a/paddle/phi/kernels/reduce_kernel_impl.h +++ b/paddle/phi/kernels/reduce_kernel_impl.h @@ -21,4 +21,6 @@ bool ReduceCheckIfOneDNNSupport(const KernelContext* ctx); bool ReduceGradCheckIfOneDNNSupport(const KernelContext* ctx); +bool ReduceMeanCheckIfOneDNNSupport(const KernelContext* ctx); + } // namespace phi diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index 16b3abf0e2931..a657e7ba8c01d 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ 
b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -67,7 +67,7 @@ PD_REGISTER_KERNEL(mean, KPS, ALL_LAYOUT, phi::MeanKernel, float) {} #if defined(PADDLE_WITH_DNNL) PD_REGISTER_KERNEL( mean, OneDNN, ONEDNN, phi::MeanKernel, float, phi::dtype::bfloat16) { - kernel->check_if_onednn_kernel_support_ = phi::ReduceCheckIfOneDNNSupport; + kernel->check_if_onednn_kernel_support_ = phi::ReduceMeanCheckIfOneDNNSupport; } #endif diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh b/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh index 775c23def14b0..3b6de498ef5b5 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel_impl.cuh @@ -566,7 +566,7 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f16f16f3 // conv_forward_cuda_m128n16k16_f32f32f32 template -__global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) +__global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) { const int K_tile = 16; @@ -578,27 +578,27 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 __shared__ float B_shared[256]; #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { C_local[i] = 0.0; } - + int K_loops = K_implicit / 16; - int block_num_n = (N - 1) / 16 + 1; + int block_num_n = (N - 1) / 16 + 1; int blockIdx_m = (int)blockIdx.x / block_num_n; int blockIdx_n = (int)blockIdx.x % block_num_n; int threadIdx_x = (int)threadIdx.x; // hoisting shared pointer offsets - int * out_in_map_ptr = out_in_map - + (blockIdx_m * 128 + (threadIdx_x / (16/4)))* kernel_volume; + int * out_in_map_ptr = out_in_map + + (blockIdx_m * 128 + (threadIdx_x / (16/4)))* kernel_volume; - float * B_ptr = B - + (threadIdx_x / (16/4)) * N - + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); + float * B_ptr = B + + (threadIdx_x / (16/4)) * N + + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); float * A_shared_ptr = A_shared + (threadIdx_x * 4); - float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 4) * 16); + float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 4) * 16); float * B_shared_ptr = B_shared + (threadIdx_x * 4); float * B_shared_reduce_ptr = B_shared + (threadIdx_x % 4); @@ -648,7 +648,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 } int* out_in_map_ptr_local = out_in_map_ptr + k_0 * 16 / K_tile_padded; - float* A_ptr_local = A + (k_0 * 16 % K_tile_padded) + channel_offset_A; + float* A_ptr_local = A + (k_0 * 16 % K_tile_padded) + channel_offset_A; float* B_ptr_local; if constexpr (K_ld_check) @@ -661,14 +661,14 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0) { - int input_idx = *(out_in_map_ptr_local + (ax0_ax1_fused_0 *16) * kernel_volume); + int input_idx = *(out_in_map_ptr_local + (ax0_ax1_fused_0 *16) * kernel_volume); if (input_idx != -1) { uint4 A_loaded = make_uint4(0, 0, 0, 0); global_load(A_loaded, A_ptr_local + (input_idx * K_original) , A_pred_guard); *(uint4 *)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) = A_loaded; } - else + else { *(uint4*)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) = make_uint4(0, 0, 0, 0); } 
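The conv_forward_cuda_* hunks in this file are whitespace-only cleanups, but the kernels' structure is easy to lose in the churn: each is an implicit GEMM in which out_in_map supplies the gather indices. A plain host-side reference of the same contraction (a sketch for orientation, not code from this PR; it assumes A is [num_inputs, K_original] row-major, B is [kernel_volume * K_original, N] row-major, and out_in_map is [M, kernel_volume] with -1 marking kernel taps that hit no input):

// Naive reference for the implicit-GEMM sparse convolution:
//   C[m][n] = sum_k sum_c A[out_in_map[m][k]][c] * B[k * K_original + c][n]
void conv_forward_reference(int M, int K_original, int N, int kernel_volume,
                            const float* A, const float* B,
                            const int* out_in_map, float* C) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < kernel_volume; ++k) {
        int row = out_in_map[m * kernel_volume + k];
        if (row == -1) continue;  // this tap has no matching input voxel
        for (int c = 0; c < K_original; ++c) {
          acc += A[row * K_original + c] * B[(k * K_original + c) * N + n];
        }
      }
      C[m * N + n] = acc;
    }
  }
}

The tiled kernels compute this same contraction, staging 128-row tiles of A and 16- or 64-column tiles of B through shared memory, with the -1 check replaced by a zero-fill of the shared-memory tile.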
@@ -678,23 +678,23 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 1; ++ax0_ax1_fused_0_1) { uint4 B_loaded = make_uint4(0, 0, 0, 0); - global_load(B_loaded, B_ptr_local + (ax0_ax1_fused_0_1 * 16) * N, B_pred_guard); + global_load(B_loaded, B_ptr_local + (ax0_ax1_fused_0_1 * 16) * N, B_pred_guard); *(uint4 *)(B_shared_ptr + (ax0_ax1_fused_0_1 * 256)) = B_loaded; } __syncthreads(); #pragma unroll - for (int k_1 = 0; k_1 < ( 16 / 4); ++k_1) + for (int k_1 = 0; k_1 < ( 16 / 4); ++k_1) { #pragma unroll - for (int k_2 = 0; k_2 < 4; ++k_2) + for (int k_2 = 0; k_2 < 4; ++k_2) { int vk_in_block = (k_1 << 2) + k_2; #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - C_local[i] = C_local[i] + - A_shared_reduce_ptr[((i / 4) * 16) * 16 + vk_in_block] + C_local[i] = C_local[i] + + A_shared_reduce_ptr[((i / 4) * 16) * 16 + vk_in_block] * B_shared_reduce_ptr[(vk_in_block * 16) + ((i % 4) * 4)]; } @@ -707,7 +707,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 for (int i = 0; i < 32; ++i) { int location_cur = location_offset + ((i / 4) * 16); - int vn = C_n_offset + ((i % 4) * 4); + int vn = C_n_offset + ((i % 4) * 4); if constexpr (N_ld_check) { @@ -723,34 +723,34 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting1_mode0_f32f32f32 } // conv_forward_cuda_m128n16k32_f32f32f32 -__global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) +__global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) { float C_local[32]; __shared__ float A_shared[4096]; __shared__ float B_shared[512]; #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { C_local[i] = 0.0; } - + int K_loops = (K_original * kernel_volume - 1) / 32 + 1; - int block_num_n = (N - 1) / 16 + 1; + int block_num_n = (N - 1) / 16 + 1; int blockIdx_m = (int)blockIdx.x / block_num_n; int blockIdx_n = (int)blockIdx.x % block_num_n; int threadIdx_x = (int)threadIdx.x; // hoisting shared pointer offsets - int * out_in_map_ptr = out_in_map - + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume; + int * out_in_map_ptr = out_in_map + + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume; - float * B_ptr = B - + (threadIdx_x / (16/4)) * N - + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); + float * B_ptr = B + + (threadIdx_x / (16/4)) * N + + (blockIdx_n * 16) + ((threadIdx_x * 4) % 16); float * A_shared_ptr = A_shared + (threadIdx_x * 4); - float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 4) * 32); + float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 4) * 32); float * B_shared_ptr = B_shared + (threadIdx_x * 4); float * B_shared_reduce_ptr = B_shared + (threadIdx_x % 4); @@ -762,7 +762,7 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32 #pragma unroll for (int k_0 = 0; k_0 < K_loops; ++k_0) { - int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; + int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; int kernel_offset = k_0 / (K_original / 32); int *out_in_map_ptr_k = out_in_map_ptr + kernel_offset; @@ -772,8 +772,8 @@ __global__ void 
__launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32 for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 16; ++ax0_ax1_fused_0) { - int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *8) * kernel_volume); - if (input_idx != -1) + int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *8) * kernel_volume); + if (input_idx != -1) { *(float4*)(A_shared_ptr + (ax0_ax1_fused_0 * 256)) = // ax0_ax1_fused_0 * elements loaded in each loop @@ -788,27 +788,27 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32 } #pragma unroll - for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 2; ++ax0_ax1_fused_0_1) + for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 2; ++ax0_ax1_fused_0_1) { *(float4*)(B_shared_ptr + (ax0_ax1_fused_0_1 * 256)) = // ax0_ax1_fused_0_1 * elements loaded in each loop - *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 16)) * N); + *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 16)) * N); } __syncthreads(); #pragma unroll - for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) + for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) { #pragma unroll - for (int k_2 = 0; k_2 < 4; ++k_2) + for (int k_2 = 0; k_2 < 4; ++k_2) { int vk_in_block = (k_1 << 2) + k_2; #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - C_local[i] = C_local[i] + - A_shared_reduce_ptr[((i / 4) * 16) * 32 + vk_in_block] + C_local[i] = C_local[i] + + A_shared_reduce_ptr[((i / 4) * 16) * 32 + vk_in_block] * B_shared_reduce_ptr[(vk_in_block * 16) + ((i % 4) * 4)]; } @@ -818,44 +818,44 @@ __global__ void __launch_bounds__(64) conv_forward_cuda_setting2_mode0_f32f32f32 } #pragma unroll - for (int i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { int location_cur = location_offset + ((i / 4) * 16); - int vn = C_n_offset + ((i % 4) * 4); + int vn = C_n_offset + ((i % 4) * 4); if (location_cur < M) C[location_cur * N + vn] = C_local[i]; } } // conv_forward_cuda_m128n64k32_f32f32f32 -__global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) +__global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f32(int M, int K_original, int N, int kernel_volume, float* __restrict__ A, float* __restrict__ B, int* __restrict__ out_in_map, float* __restrict__ C) { float C_local[64]; __shared__ float A_shared[4096]; __shared__ float B_shared[2048]; #pragma unroll - for (int i = 0; i < 64; ++i) + for (int i = 0; i < 64; ++i) { C_local[i] = 0.0; } - + int K_loops = (K_original * kernel_volume - 1) / 32 + 1; - int block_num_n = (N - 1) / 64 + 1; + int block_num_n = (N - 1) / 64 + 1; int blockIdx_m = (int)blockIdx.x / block_num_n; int blockIdx_n = (int)blockIdx.x % block_num_n; int threadIdx_x = (int)threadIdx.x; // hoisting shared pointer offsets - int * out_in_map_ptr = out_in_map - + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume; + int * out_in_map_ptr = out_in_map + + (blockIdx_m * 128 + (threadIdx_x / (32/4)))* kernel_volume; - float * B_ptr = B - + (threadIdx_x / (64/4)) * N - + (blockIdx_n * 64) + ((threadIdx_x * 4) % 64); + float * B_ptr = B + + (threadIdx_x / (64/4)) * N + + (blockIdx_n * 64) + ((threadIdx_x * 4) % 64); float * A_shared_ptr = A_shared + (threadIdx_x * 4); - float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 16) * 32); + float * A_shared_reduce_ptr = A_shared + ((threadIdx_x / 16) * 32); float * B_shared_ptr = B_shared + (threadIdx_x * 4); float * 
B_shared_reduce_ptr = B_shared + (threadIdx_x % 16); @@ -867,7 +867,7 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3 #pragma unroll for (int k_0 = 0; k_0 < K_loops; ++k_0) { - int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; + int channel_offset = k_0 % (K_original / 32) * 32 + channel_offset_A; int kernel_offset = k_0 / (K_original / 32); int *out_in_map_ptr_k = out_in_map_ptr + kernel_offset; @@ -877,8 +877,8 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3 for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0) { - int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *16) * kernel_volume); - if (input_idx != -1) + int input_idx = *(out_in_map_ptr_k + (ax0_ax1_fused_0 *16) * kernel_volume); + if (input_idx != -1) { *(float4*)(A_shared_ptr + (ax0_ax1_fused_0 * 512)) = // ax0_ax1_fused_0 * elements loaded in each loop @@ -893,27 +893,27 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3 } #pragma unroll - for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 4; ++ax0_ax1_fused_0_1) + for (int ax0_ax1_fused_0_1 = 0; ax0_ax1_fused_0_1 < 4; ++ax0_ax1_fused_0_1) { *(float4*)(B_shared_ptr + (ax0_ax1_fused_0_1 * 512)) = // ax0_ax1_fused_0_1 * elements loaded in each loop - *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 8)) * N); + *(float4*)(B_ptr + ((k_0 * 32) + (ax0_ax1_fused_0_1 * 8)) * N); } __syncthreads(); #pragma unroll - for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) + for (int k_1 = 0; k_1 < ( 32 / 4); ++k_1) { #pragma unroll - for (int k_2 = 0; k_2 < 4; ++k_2) + for (int k_2 = 0; k_2 < 4; ++k_2) { int vk_in_block = (k_1 << 2) + k_2; #pragma unroll - for (int i = 0; i < 64; ++i) + for (int i = 0; i < 64; ++i) { - C_local[i] = C_local[i] + - A_shared_reduce_ptr[((i / 4) * 8) * 32 + vk_in_block] + C_local[i] = C_local[i] + + A_shared_reduce_ptr[((i / 4) * 8) * 32 + vk_in_block] * B_shared_reduce_ptr[(vk_in_block * 64) + ((i % 4) * 16)]; } @@ -923,10 +923,10 @@ __global__ void __launch_bounds__(128) conv_forward_cuda_setting3_mode0_f32f32f3 } #pragma unroll - for (int i = 0; i < 64; ++i) + for (int i = 0; i < 64; ++i) { int location_cur = location_offset + ((i / 4) * 8); - int vn = C_n_offset + ((i % 4) * 16); + int vn = C_n_offset + ((i % 4) * 16); if (location_cur < M) C[location_cur * N + vn] = C_local[i]; } @@ -944,10 +944,10 @@ void conv_forward_implicit_gemm_cuda( auto compute_capability = dev_ctx.GetComputeCapability(); bool allow_fp16 = compute_capability >= 75; bool is_half = _in_feats.dtype() == phi::DataType::FLOAT16; - + int num_in_feats = _in_feats.dims()[0]; int num_in_channels = _in_feats.dims()[1]; - + int kernel_volume = _out_in_map.dims()[1]; auto out_in_map = const_cast(_out_in_map.data()); @@ -1141,7 +1141,7 @@ void conv_forward_implicit_gemm_cuda( { int block_num_M = (num_out_feats + 127) / 128; int block_num_N = num_out_channels / 64; //j_factors1 - dim3 num_blocks(block_num_M * block_num_N); + dim3 num_blocks(block_num_M * block_num_N); dim3 threads_per_block(128); conv_forward_cuda_setting3_mode0_f32f32f32<<<num_blocks, threads_per_block>>>( _out_feats.dims()[0], num_in_channels, num_out_channels, kernel_volume, in_feats, kernel, out_in_map, out_feats); @@ -1150,7 +1150,7 @@ void conv_forward_implicit_gemm_cuda( { int block_num_M = (num_out_feats + 127) / 128; int block_num_N = num_out_channels / 16; //j_factors1 - dim3 num_blocks(block_num_M * block_num_N); + dim3 num_blocks(block_num_M * block_num_N); dim3 threads_per_block(64);
conv_forward_cuda_setting2_mode0_f32f32f32<<<num_blocks, threads_per_block>>>( _out_feats.dims()[0], num_in_channels, num_out_channels, kernel_volume, in_feats, kernel, out_in_map, out_feats); @@ -1159,7 +1159,7 @@ void conv_forward_implicit_gemm_cuda( { int block_num_M = (num_out_feats + 127) / 128; int block_num_N = (num_out_channels + 15) / 16; //j_factors1 - dim3 num_blocks(block_num_M * block_num_N); + dim3 num_blocks(block_num_M * block_num_N); dim3 threads_per_block(64); if (num_in_channels % 16 == 0) diff --git a/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh b/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh index 73ad53de502da..380abb419b40a 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh +++ b/paddle/phi/kernels/sparse/gpu/sparse_conv_hashmap.cuh @@ -65,7 +65,7 @@ class GPUHashTable { key_type* table_keys; val_type* table_vals; void insert_many_coords(const phi::GPUContext& dev_ctx, const int *coords, const int n); - void lookup_many_coords(const phi::GPUContext& dev_ctx, const int *coords, val_type *results, + void lookup_many_coords(const phi::GPUContext& dev_ctx, const int *coords, val_type *results, const int* kernel_sizes, const int* tensor_strides, const int n, const int kernel_volume); public: @@ -112,8 +112,8 @@ __global__ void insert_coords_kernel(key_type* table_keys, val_type* table_vals, template __global__ void lookup_coords_kernel( - key_type* table_keys, val_type* table_vals, const int* coords, val_type* vals, - const int* kernel_sizes, const int* strides, + key_type* table_keys, val_type* table_vals, const int* coords, val_type* vals, + const int* kernel_sizes, const int* strides, int n, int _capacity, int kernel_volume, int _width) { int tidx = blockIdx.x * blockDim.x + threadIdx.x; @@ -125,8 +125,8 @@ __global__ void lookup_coords_kernel( //coords_out[2] = in_coords[2]; //coords_out[3] = in_coords[3]; coords_out[0] = in_coords[0]; - - if constexpr (odd) + + if constexpr (odd) { #pragma unroll for(int i = 0; i <= _width-2; i++){ @@ -146,7 +146,7 @@ __global__ void lookup_coords_kernel( _kernel_idx /= kernel_sizes[i]; } } - + if (idx < n) { key_type key = (key_type)(hash_func_64b(coords_out, _width)); @@ -156,7 +156,7 @@ __global__ void lookup_coords_kernel( { key_type cur_key = table_keys[slot]; if (key == cur_key) - { + { vals[idx * kernel_volume + kernel_idx] = table_vals[slot] - 1; // need to subtract 1 to avoid extra operations in python } if (table_keys[slot] == EMPTY_CELL) @@ -181,7 +181,7 @@ void GPUHashTable::insert_coords(const phi::GPUContext& dev_ template void GPUHashTable::lookup_many_coords( const phi::GPUContext& dev_ctx, - const int* coords, val_type* results, + const int* coords, val_type* results, const int* kernel_sizes, const int* strides, const int n, const int kernel_volume){ if (kernel_volume % 2) diff --git a/paddle/phi/kernels/xpu/rms_norm_kernel.cc b/paddle/phi/kernels/xpu/rms_norm_kernel.cc index 698b2b195da82..85a4ea7291a14 100644 --- a/paddle/phi/kernels/xpu/rms_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/rms_norm_kernel.cc @@ -63,10 +63,10 @@ void RmsNormKernel(const Context& dev_ctx, const T* norm_weight_data = norm_weight.data(); const T* norm_bias_data = norm_bias ?
norm_bias.get().data() : nullptr; // float* inv_var_data = nullptr; - if (inv_var != nullptr) { - // inv_var_data = dev_ctx.template Alloc(inv_var); - PD_THROW("rms_norm in XPU kernel does not support inv_var output"); - } + // if (inv_var != nullptr) { + // inv_var_data = dev_ctx.template Alloc(inv_var); + // PD_THROW("rms_norm in XPU kernel does not support inv_var output"); + // } int32_t rows = 1; int32_t cols = 1; diff --git a/paddle/phi/kernels/xpu/swiglu_kernel.cc b/paddle/phi/kernels/xpu/swiglu_kernel.cc index a7815931fa6a8..9ba9c10ea1a43 100644 --- a/paddle/phi/kernels/xpu/swiglu_kernel.cc +++ b/paddle/phi/kernels/xpu/swiglu_kernel.cc @@ -50,7 +50,7 @@ void SwiGluKernel(const Context& ctx, reinterpret_cast(z_data), dims_vec, axis, - false, + true, const_nullptr, nullptr, y_ptr); diff --git a/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc b/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc index 994699a9fa63a..290081a48f36d 100644 --- a/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc +++ b/paddle/phi/kernels/xpu/swiglu_kernel_grad.cc @@ -64,7 +64,7 @@ void SwiGluGradKernel(const Context& ctx, reinterpret_cast(dx_data), dims_vec, axis, - false, + true, y_ptr, dy_ptr); PADDLE_ENFORCE_XDNN_SUCCESS(ret, "swiglu_grad"); diff --git a/paddle/phi/kernels/xpu/tile_kernel.cc b/paddle/phi/kernels/xpu/tile_kernel.cc index 5e665711efc8d..6b8dbf641f803 100644 --- a/paddle/phi/kernels/xpu/tile_kernel.cc +++ b/paddle/phi/kernels/xpu/tile_kernel.cc @@ -143,4 +143,5 @@ PD_REGISTER_KERNEL(tile, double, int, int64_t, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::float16) {} diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index 934e55ad90a92..702745d436beb 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ b/paddle/phi/ops/yaml/backward.yaml @@ -1067,6 +1067,7 @@ infer_meta : func : KernelWithXShapeInferMeta param : [xshape, out_grad] + spmd_rule : FlattenGradInferSpmd kernel : func : flatten_grad data_type : out_grad @@ -1825,6 +1826,22 @@ kernel : func : logsumexp_grad +- backward_op : lstm_grad + forward: lstm (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias, bool use_peepholes + = true, bool is_reverse = false, bool is_test = false, str gate_activation = "sigmoid", + str cell_activation = "tanh", str candidate_activation = "tanh") -> Tensor (hidden), Tensor (cell), Tensor (batch_gate), Tensor (batch_cell_pre_act) + args: (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias, Tensor hidden, Tensor cell, + Tensor batch_gate, Tensor batch_cell_pre_act, Tensor hidden_grad, bool use_peepholes, bool is_reverse, bool is_test, str gate_activation, + str cell_activation, str candidate_activation) + output: Tensor(input_grad), Tensor(h0_grad), Tensor(c0_grad), Tensor(weight_grad), Tensor(bias_grad) + infer_meta: + func: LSTMGradInferMeta + param: [input, h0, c0, weight, bias] + kernel: + func: lstm_grad + data_type: input + optional: h0, c0 + - backward_op : lu_grad forward : lu (Tensor x, bool pivot = true) -> Tensor(out), Tensor(pivots), Tensor(infos) args : (Tensor x, Tensor out, Tensor pivots, Tensor out_grad, bool pivot) diff --git a/paddle/phi/ops/yaml/fused_ops.yaml b/paddle/phi/ops/yaml/fused_ops.yaml index 5db39e9d207d7..1aac7524d84ab 100644 --- a/paddle/phi/ops/yaml/fused_ops.yaml +++ b/paddle/phi/ops/yaml/fused_ops.yaml @@ -56,6 +56,20 @@ data_transform : skip_transform : max_enc_len_this_time, max_dec_len_this_time +- op : block_multihead_attention_xpu + args : (Tensor qkv, Tensor key_cache, Tensor value_cache, Tensor 
seq_lens_encoder, Tensor seq_lens_decoder, Tensor seq_lens_this_time, Tensor padding_offsets, Tensor cum_offsets, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor block_tables, Tensor cache_k_per_batch_maxs, Tensor cache_v_per_batch_maxs, Tensor pre_key_cache, Tensor pre_value_cache, Tensor rope_emb, Tensor mask, Tensor tgt_mask, Tensor cache_k_quant_scales, Tensor cache_v_quant_scales, Tensor cache_k_dequant_scales, Tensor cache_v_dequant_scales, Tensor qkv_out_scale, Tensor qkv_bias, Tensor out_shift, Tensor out_smooth, Tensor max_enc_len_this_time, Tensor max_dec_len_this_time, int max_seq_len, int block_size, bool use_neox_style, bool dynamic_cachekv_quant=false, int quant_round_type=1, float quant_max_bound=127.0, float quant_min_bound=-127.0, float out_scale=-1, str compute_dtype = "default") + output : Tensor(fmha_out), Tensor(qkv_out), Tensor(key_cache_out), Tensor(value_cache_out) + infer_meta : + func : BlockMultiheadAttentionInferXPUMeta + kernel : + func : block_multihead_attention_xpu + data_type : qkv + optional : pre_key_cache, pre_value_cache, rope_emb, mask, tgt_mask, cache_k_quant_scales, cache_v_quant_scales, cache_k_dequant_scales, cache_v_dequant_scales, qkv_out_scale, qkv_bias, out_shift, out_smooth, max_enc_len_this_time, max_dec_len_this_time + inplace : (qkv -> qkv_out), (key_cache -> key_cache_out), (value_cache -> value_cache_out) + support_dygraph_mode : true + data_transform : + skip_transform : max_enc_len_this_time, max_dec_len_this_time + - op : bn_act_xpu args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, float momentum, float epsilon, str data_format, int act_type) output : Tensor(out) @@ -400,6 +414,16 @@ func: fused_token_prune support_dygraph_mode : true +- op : fusion_group + args: (Tensor[] inputs, int[] outs_dtype = {}, int[] inputs_dtype = {}, str func_name = "", int type + = 0) + output: Tensor[] (outs){inputs.size()} + infer_meta: + func: FusionGroupInferMeta + kernel: + func: fusion_group + data_type : DataType::FLOAT32 + - op : fusion_gru args : (Tensor x, Tensor h0, Tensor weight_x, Tensor weight_h, Tensor bias, str activation = "tanh", str gate_activation = "sigmoid", bool is_reverse = false, bool use_seq = true, bool origin_mode = false, bool force_fp32_output = false) output : Tensor(reordered_h0), Tensor(xx), Tensor(batched_input), Tensor(batched_out), Tensor(hidden) @@ -685,3 +709,15 @@ func : yolo_box_xpu data_type : x optional : x_max + +- op: add_group_norm_silu + args : (Tensor x,Tensor residual, Tensor scale, Tensor bias, float epsilon = 1e-5, int groups = -1, str data_format = "NCHW", str activation = "") + output : Tensor(y), Tensor(residual_out), Tensor(mean), Tensor(variance) + infer_meta : + func : AddGroupNormSiluInferMeta + kernel : + func : add_group_norm_silu + data_type : x + optional : scale, bias, residual, residual_out + support_dygraph_mode : true + interfaces : paddle::dialect::LayoutTransformationInterface diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index ddfe98cefcc80..2f59244893ffc 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -49,14 +49,6 @@ inplace : (x -> out) interfaces : paddle::dialect::InferSymbolicShapeInterface, paddle::dialect::LayoutTransformationInterface -- op : assign_pos - args : (Tensor x, Tensor cum_count, Tensor eff_num_len) - output : Tensor(out) - infer_meta : - func : AssignPosInferMeta - kernel : - func : assign_pos - - op : 
assign_value args : (int[] shape, DataType dtype, Scalar[] values, Place place = {}) output : Tensor(out) @@ -196,15 +188,6 @@ data_type : dtype inplace: (input -> output) -- op : decayed_adagrad - args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float decay = 0.95f, float epsilon = 1.0e-6f) - output : Tensor(param_out), Tensor(moment_out) - infer_meta : - func : DecayedAdagradInferMeta - kernel : - func : decayed_adagrad - data_type : param - - op : dequantize_linear args : (Tensor x, Tensor scale, Tensor zero_point, Tensor in_accum, Tensor in_state, int quant_axis = 0, int bit_length = 8, int round_type = 0, bool is_test = true, bool only_observer = false) output : Tensor(y), Tensor(out_state), Tensor(out_accum), Tensor(out_scale) @@ -859,16 +842,6 @@ backward : subtract_grad interfaces : paddle::dialect::InferSymbolicShapeInterface -- op : tdm_sampler - args: (Tensor x, Tensor travel, Tensor layer, bool output_positive=true, int[] neg_samples_num_list={}, int[] layer_offset_lod={}, int seed = 0, int dtype=2) - output: Tensor(out), Tensor(labels), Tensor(mask) - infer_meta: - func : TdmSamplerInferMeta - kernel: - func : tdm_sampler - data_type : x - optional : labels - - op : tile args : (Tensor x, IntArray repeat_times = {}) output : Tensor(out) @@ -1001,15 +974,6 @@ optional: bias, sample_weight, custom_dist_probs, custom_dist_alias, custom_dist_alias_probs backward: nce_grad -- op: number_count - args: (Tensor numbers, int upper_range) - output: Tensor(out) - infer_meta: - func: NumberCountInferMeta - kernel: - func: number_count - data_type: numbers - - op: onednn_to_paddle_layout args: (Tensor x, int dst_layout) output: Tensor(out) diff --git a/paddle/phi/ops/yaml/legacy/backward_exclude.yaml b/paddle/phi/ops/yaml/legacy/backward_exclude.yaml index 335952bc3475c..9a327ef5dd4b3 100644 --- a/paddle/phi/ops/yaml/legacy/backward_exclude.yaml +++ b/paddle/phi/ops/yaml/legacy/backward_exclude.yaml @@ -5,7 +5,6 @@ - amax_grad - amin_grad - cast_grad -- channel_shuffle_grad - conv2d_transpose_double_grad - conv2d_transpose_grad - deformable_conv_grad @@ -34,7 +33,6 @@ - repeat_interleave_grad - repeat_interleave_with_tensor_index_grad - rnn_grad -- rrelu_grad - set_value_with_tensor_grad - slice_double_grad - slice_grad diff --git a/paddle/phi/ops/yaml/legacy/ops_exclude.yaml b/paddle/phi/ops/yaml/legacy/ops_exclude.yaml index 160e33c5b36c8..703c948240df0 100644 --- a/paddle/phi/ops/yaml/legacy/ops_exclude.yaml +++ b/paddle/phi/ops/yaml/legacy/ops_exclude.yaml @@ -22,7 +22,6 @@ - c_sync_calc_stream - c_sync_comm_stream - cast -- channel_shuffle - conv2d_transpose - conv2d_transpose_bias - copy_to @@ -75,7 +74,6 @@ - repeat_interleave - repeat_interleave_with_tensor_index - rnn -- rrelu - sequence_mask - set_value_with_tensor - slice diff --git a/paddle/phi/ops/yaml/legacy/static_ops.yaml b/paddle/phi/ops/yaml/legacy/static_ops.yaml index 1280fd3716f0a..d9d0c222b770f 100755 --- a/paddle/phi/ops/yaml/legacy/static_ops.yaml +++ b/paddle/phi/ops/yaml/legacy/static_ops.yaml @@ -699,6 +699,14 @@ func : swish backward : swish_grad +- op : transfer_layout + args: (Tensor x, int src_layout = -1, int dst_layout=-1) + output: Tensor (out) + infer_meta: + func: TransferLayoutInferMeta + kernel: + func: transfer_layout + - op : tril_indices args : (int rows = 0, int cols = 0, int offset = 0, DataType dtype = DataType::INT64) output : Tensor(out) diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index 2f7af0b64c802..6fb5afeb87a07 100755 --- 
a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -4062,6 +4062,12 @@ outputs : {slimmed_x : SlimmedX, cls_inds : CLSInds} +- op: fusion_group + inputs: + inputs : Inputs + outputs: + outs : Outs + - op: fusion_seqpool_cvm_concat inputs: {x : X, cvm : CVM} @@ -4129,6 +4135,15 @@ outputs: {out: Out} +- op: lstm + backward: lstm_grad + inputs: + {input : Input, h0 : H0, c0 : C0, weight : Weight, bias : Bias} + outputs: + {hidden : Hidden, cell : Cell, batch_gate : BatchGate, batch_cell_pre_act : BatchCellPreAct} + extra: + outputs: [batch_gate, batch_cell_pre_act] + - op: lu backward: lu_grad inputs: @@ -4250,6 +4265,8 @@ {x: X} outputs: {out: Out, noise: Noise} + extra: + outputs: [noise] - op: send_v2 inputs : @@ -4355,6 +4372,12 @@ outputs : out : Out +- op: transfer_layout + inputs: + x : X + outputs: + out : Out + - op: uniform_random_batch_size_like inputs: input : Input diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index e758d5e0438f0..21aa2868fb8b2 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -321,6 +321,14 @@ backward : assign_out__grad traits : pir::SideEffectTrait +- op : assign_pos + args : (Tensor x, Tensor cum_count, Tensor eff_num_len) + output : Tensor(out) + infer_meta : + func : AssignPosInferMeta + kernel : + func : assign_pos + - op : assign_value_ args : (Tensor output, int[] shape, DataType dtype, Scalar[] values, Place place = {}) output : Tensor(out) @@ -760,6 +768,7 @@ kernel : func : class_center_sample data_type : label + traits : pir::SideEffectTrait - op : clip args : (Tensor x, Scalar(float) min, Scalar(float) max) @@ -1052,6 +1061,15 @@ backend : place interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : decayed_adagrad + args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float decay = 0.95f, float epsilon = 1.0e-6f) + output : Tensor(param_out), Tensor(moment_out) + infer_meta : + func : DecayedAdagradInferMeta + kernel : + func : decayed_adagrad + data_type : param + - op : decode_jpeg args : (Tensor x, str mode, Place place) output : Tensor(out) @@ -1262,6 +1280,7 @@ optional : seed_tensor intermediate : mask backward : dropout_grad + traits : pir::SideEffectTrait - op : edit_distance args : (Tensor hyps, Tensor refs, Tensor hypslength, Tensor refslength, bool normalized = false) @@ -1672,6 +1691,7 @@ output : Tensor(out), Tensor(xshape) infer_meta : func : FlattenWithXShapeInferMeta + spmd_rule : FlattenInferSpmd kernel : func : flatten data_type : x @@ -2658,6 +2678,20 @@ backward : logsumexp_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : lstm + args: (Tensor input, Tensor h0, Tensor c0, Tensor weight, Tensor bias, bool use_peepholes + = true, bool is_reverse = false, bool is_test = false, str gate_activation = "sigmoid", + str cell_activation = "tanh", str candidate_activation = "tanh") + output: Tensor (hidden), Tensor (cell), Tensor (batch_gate), Tensor (batch_cell_pre_act) + infer_meta: + func: LSTMInferMeta + kernel: + func: lstm + data_type: input + optional: h0, c0 + intermediate: batch_gate, batch_cell_pre_act + backward: lstm_grad + - op : lstsq args : (Tensor x, Tensor y, Scalar rcond=0.0f, str driver="gels") output : Tensor(solution), Tensor(residuals), Tensor(rank), Tensor(singular_values) @@ -3584,7 +3618,7 @@ traits : pir::SideEffectTrait - op : rrelu - args : (Tensor x, float lower, float upper, bool is_test) + args : (Tensor x, float lower=1.0f/8, float upper=1.0f/3, bool is_test=false) 
output : Tensor(out), Tensor(noise) infer_meta : func : RReluInferMeta @@ -4148,6 +4182,25 @@ func : tanh_shrink backward : tanh_shrink_grad +- op : tdm_child + args: (Tensor x, Tensor tree_info, int child_nums, DataType dtype = DataType::INT32) + output: Tensor (child), Tensor (leaf_mask) + infer_meta: + func: TdmChildInferMeta + kernel: + func: tdm_child + data_type: x + +- op : tdm_sampler + args: (Tensor x, Tensor travel, Tensor layer, bool output_positive=true, int[] neg_samples_num_list={}, int[] layer_offset_lod={}, int seed = 0, int dtype=2) + output: Tensor(out), Tensor(labels), Tensor(mask) + infer_meta: + func : TdmSamplerInferMeta + kernel: + func : tdm_sampler + data_type : x + optional : labels + - op : temporal_shift args : (Tensor x, int seg_num, float shift_ratio = 0.25f, str data_format = "NCHW") output : Tensor(out) @@ -4374,6 +4427,7 @@ data_type: x inplace: (x -> out) backward: uniform_inplace_grad + traits : pir::SideEffectTrait - op : uniform_random_batch_size_like args: (Tensor input, int[] shape, int input_dim_idx = 0, int output_dim_idx = 0, @@ -4386,6 +4440,7 @@ uniform_random_batch_size_like_sr {selected_rows -> selected_rows} data_type: dtype no_need_buffer: input + traits : pir::SideEffectTrait - op : unique_consecutive args : (Tensor x, bool return_inverse = false, bool return_counts = false, int[] axis = {}, DataType dtype = DataType::FLOAT32) @@ -4631,3 +4686,12 @@ func: MoeInferMeta kernel: func: moe + +- op: number_count + args: (Tensor numbers, int upper_range) + output: Tensor(out) + infer_meta: + func: NumberCountInferMeta + kernel: + func: number_count + data_type: numbers diff --git a/paddle/pir/include/core/program.h b/paddle/pir/include/core/program.h index d838916eefea5..4d0da62a98c84 100644 --- a/paddle/pir/include/core/program.h +++ b/paddle/pir/include/core/program.h @@ -57,6 +57,7 @@ class IR_API Program { std::shared_ptr<Program> Clone(IrMapping& ir_mapping) const; // NOLINT + void CopyToBlock(IrMapping& ir_mapping, Block* insert_block) const; // NOLINT Block* block() { return &module_.block(); } const Block* block() const { return &module_op().block(); } @@ -70,9 +71,13 @@ class IR_API Program { parameters_ = parameters; } + uint64_t id() const { return id_; } + private: // computation graph ModuleOp module_; + // unique in current process, "almost" unique between processes. + uint64_t id_; // weight ParameterMap parameters_; }; diff --git a/paddle/pir/include/dialect/shape/utils/shape_analysis.h b/paddle/pir/include/dialect/shape/utils/shape_analysis.h index bbdda621511eb..0256d97dbc2b1 100644 --- a/paddle/pir/include/dialect/shape/utils/shape_analysis.h +++ b/paddle/pir/include/dialect/shape/utils/shape_analysis.h @@ -42,7 +42,7 @@ class IR_API InferSymbolicShapeContext { const symbol::ShapeOrDataDimExprs& GetShapeOrDataForValue(Value val) const; - void SetStaticShapeForValue(Value val); + void SetSymbolForValueByStaticShape(Value val); void SetShapeOrDataForValue(Value val, const symbol::ShapeOrDataDimExprs& shape_or_data); @@ -150,7 +150,7 @@ class IR_API ShapeConstraintIRAnalysis final friend void InferSymExprForAllValues(ModuleOp module_op); - void SetStaticShapeForValue(Value val); + void SetSymbolForValueByStaticShape(Value val); void InferShapeOrDataForValue(Value val); diff --git a/paddle/pir/src/core/program.cc b/paddle/pir/src/core/program.cc index 19d08f094fd4c..453cf3eb170df 100644 --- a/paddle/pir/src/core/program.cc +++ b/paddle/pir/src/core/program.cc @@ -13,13 +13,48 @@ // limitations under the License.
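Note: the new `Program::id()` above is documented as unique in-process but only "almost" unique between processes, because each process draws its id at random with no cross-process coordination. A back-of-the-envelope birthday bound (my own check, not part of the patch) shows why a uniform draw from a 63-bit space is safe in practice:

```python
import math

# Back-of-the-envelope check (not part of the patch): ids are drawn
# uniformly from [0, 2^63), so across n processes the probability of
# any collision is roughly 1 - exp(-n*(n-1) / (2 * 2^63)).
def collision_probability(n: int, bits: int = 63) -> float:
    pairs = n * (n - 1) / 2
    return -math.expm1(-pairs / 2.0**bits)

print(f"{collision_probability(1_000_000):.1e}")  # ~5.4e-08 for a million processes
```

Even a million concurrently running programs collide with probability around 5e-8, which justifies the hedged wording in the header comment.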
#include "paddle/pir/include/core/program.h" +#include +#include +#include +#include #include "glog/logging.h" #include "paddle/pir/include/core/ir_context.h" namespace pir { +namespace { + +int64_t GetRandomId() { + std::random_device rd{}; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dis( + 0, std::numeric_limits::max()); + return dis(gen); +} + +bool InsertGlobalStorageSuccess(int64_t random_id) { + static std::unordered_set storage; + static std::mutex mutex; + std::unique_lock lock(mutex); + return storage.emplace(random_id).second; +} + +int64_t GetUniqueRandomId() { + int kLimit = 100; + for (int i = 0; i < kLimit; ++i) { + int64_t random_id = GetRandomId(); + if (InsertGlobalStorageSuccess(random_id)) { + return random_id; + } + } + LOG(FATAL) << "Fatal bug occured in GetUniqueRandomId()."; +} + +} // namespace + Program::Program(IrContext* context) { module_ = ModuleOp::Create(context, this); + id_ = GetUniqueRandomId(); } Program::~Program() { @@ -39,6 +74,26 @@ std::shared_ptr Program::Clone(IrMapping& ir_mapping) const { return new_program; } +void Program::CopyToBlock(IrMapping& ir_mapping, Block* insert_block) const { + auto clone_options = CloneOptions::All(); + for (const auto& op : *block()) { + bool skip_op = false; + for (uint32_t i = 0; i < op.num_results(); i++) { + if (ir_mapping.GetMutableMap().count(op.result(i))) { + skip_op = true; + break; + } + } + if (skip_op) { + continue; + } + + auto* new_op = op.Clone(ir_mapping, clone_options); + insert_block->push_back(new_op); + } + return; +} + Parameter* Program::GetParameter(const std::string& name) const { if (parameters_.count(name) != 0) { return parameters_.at(name).get(); diff --git a/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc b/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc index 343b1bf329c2c..e51cf34aa4bc9 100644 --- a/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc +++ b/paddle/pir/src/dialect/shape/transforms/shape_optimization_pass.cc @@ -126,6 +126,10 @@ void DebugPrintOpInfo(pir::Operation* op, std::ostringstream print_stream; for (uint32_t i = 0; i < op->num_results(); ++i) { const auto& res = op->result(i); + if (!res || !res.type()) { + continue; + } + print_stream << "\tresult(" << res.dyn_cast().index() << ") " << "ShapeOrData: {"; @@ -170,6 +174,10 @@ void CheckInferSymWithInferMeta( pir::InferSymbolicShapeContext* infer_context = nullptr) { for (uint32_t i = 0; i < op->num_results(); ++i) { const auto& res = op->result(i); + if (!res || !res.type()) { + continue; + } + std::ostringstream print_stream; // InferMeta funcs of some Ops are not corrrect now, we don't check them. 
@@ -299,7 +307,7 @@ void InferSymExprForBlock(const Block& block, << " DOES NOT have InferSymbolicShapeInterface!"; } for (uint32_t i = 0; i < op.num_results(); ++i) { - infer_context->SetStaticShapeForValue(op.result(i)); + infer_context->SetSymbolForValueByStaticShape(op.result(i)); } } DebugPrintOpInfo(&op, infer_context); @@ -314,6 +322,9 @@ void InferSymExprForAllValues(ModuleOp module_op) { auto infer_context = shape_analysis.MutInferSymbolicShapeContext(); for (uint32_t i = 0; i < module_op->num_regions(); i++) { for (auto& block : module_op->region(i)) { + for (auto& [_, value] : block.kwargs()) { + infer_context->SetSymbolForValueByStaticShape(value); + } InferSymExprForBlock(block, infer_context); } } diff --git a/paddle/pir/src/dialect/shape/utils/shape_analysis.cc b/paddle/pir/src/dialect/shape/utils/shape_analysis.cc index b62ad0f2a3d95..d73908b0db0b4 100644 --- a/paddle/pir/src/dialect/shape/utils/shape_analysis.cc +++ b/paddle/pir/src/dialect/shape/utils/shape_analysis.cc @@ -67,14 +67,15 @@ InferSymbolicShapeContext::GetShapeOrDataForValue(Value val) const { return value_id_to_shape_or_data_.at(val.impl()->id()); } -void InferSymbolicShapeContext::SetStaticShapeForValue(Value val) { +void InferSymbolicShapeContext::SetSymbolForValueByStaticShape(Value val) { const auto& value_type = val.type(); if (!val || !value_type) { - PADDLE_THROW( - phi::errors::Fatal("Set static shape for null value is FOBBIDEN!")); + LOG(WARNING) << "Risk on SetSymbolForValueByStaticShape for null value"; + return; } if (!IsStaticShape(val)) { - LOG(WARNING) << "Risk on SetStaticShapeForValue for contain_unknown_dim"; + LOG(WARNING) + << "Risk on SetSymbolForValueByStaticShape for contain_unknown_dim"; } const auto& GetStaticShapeForDenseTensorType = [&](DenseTensorType type_info) -> symbol::TensorShapeOrDataDimExprs { @@ -289,8 +290,8 @@ const std::string ShapeConstraintIRAnalysis::GetNextSymName() { return context_.GetNextSymName(); } -void ShapeConstraintIRAnalysis::SetStaticShapeForValue(Value val) { - context_.SetStaticShapeForValue(val); +void ShapeConstraintIRAnalysis::SetSymbolForValueByStaticShape(Value val) { + context_.SetSymbolForValueByStaticShape(val); } void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) { @@ -319,7 +320,7 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) { for (auto& operand : GetRealOperandSource(op)) { if (operand.impl() && !context_.HasShapeOrDataForValue(operand)) { if (!operand.defining_op()) { - SetStaticShapeForValue(operand); + SetSymbolForValueByStaticShape(operand); } else { Visit(operand.defining_op()); } @@ -334,7 +335,7 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) { for (auto& operand : GetRealOperandSource(op)) { if (operand.impl() && !context_.HasShapeOrDataForValue(operand)) { if (!operand.defining_op()) { - SetStaticShapeForValue(operand); + SetSymbolForValueByStaticShape(operand); } else { has_prev_op = true; } @@ -394,7 +395,7 @@ void ShapeConstraintIRAnalysis::InferShapeOrDataForValue(Value val) { << " DOES NOT have InferSymbolicShapeInterface!"; for (auto& result_value : op->results()) { if (result_value && (!context_.HasShapeOrDataForValue(result_value))) { - SetStaticShapeForValue(result_value); + SetSymbolForValueByStaticShape(result_value); } } } @@ -412,7 +413,7 @@ ShapeConstraintIRAnalysis::GetShapeOrDataForValue(Value val) { if (!context_.HasShapeOrDataForValue(val)) { // backtrack to infer shape from defining op if (!val.defining_op()) { - SetStaticShapeForValue(val); + 
SetSymbolForValueByStaticShape(val); } else { VLOG(3) << "InferShapeOrDataForValue, defining_op: " << val.defining_op()->name(); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 45b796671852e..7fcb1898bbe62 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -94,6 +94,7 @@ if not defined retry_times set retry_times=1 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python38 if not defined BUILD_DIR set BUILD_DIR=build if not defined TEST_INFERENCE set TEST_INFERENCE=ON +if not defined WITH_PIP_CUDA_LIBRARIES set WITH_PIP_CUDA_LIBRARIES=OFF set task_name=%1 set UPLOAD_TP_FILE=OFF @@ -301,6 +302,7 @@ rem ------Build windows avx whl package------ :CASE_build_avx_whl set WITH_AVX=ON set ON_INFER=ON +set WITH_PIP_CUDA_LIBRARIES=ON if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=All call :cmake || goto cmake_error @@ -515,7 +517,7 @@ echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^ -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^ --DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% +-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES% >> %work_dir%\win_cmake.sh echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ @@ -525,7 +527,7 @@ echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^ -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^ --DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% >> %work_dir%\win_cmake.sh +-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES% >> %work_dir%\win_cmake.sh cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ @@ -535,7 +537,7 @@ cmake .. 
-G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_ -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUDA_ARCH_BIN=%CUDA_ARCH_BIN% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^ -DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% -DWITH_CPP_TEST=%WITH_CPP_TEST% ^ --DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% +-DWIN_UNITTEST_LEVEL=%WIN_UNITTEST_LEVEL% -DWITH_NIGHTLY_BUILD=%WITH_NIGHTLY_BUILD% -DWITH_PIP_CUDA_LIBRARIES=%WITH_PIP_CUDA_LIBRARIES% goto:eof :cmake_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8c0266c36e8c1..e793c210628be 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1130,7 +1130,10 @@ function check_whl_size() { function generate_upstream_develop_api_spec() { set -x + # Temporarily save some scripts from PR branch cp ${PADDLE_ROOT}/python/requirements.txt /tmp + cp ${PADDLE_ROOT}/tools/print_signatures.py /tmp + mkdir -p ${PADDLE_ROOT}/build/pr_whl && mv ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl/ pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` echo "pr_whl_size: ${pr_whl_size}" @@ -1178,17 +1181,20 @@ function generate_api_spec() { echo "Not supported $2" exit 1 fi + if [ "$spec_kind" == "DEV" ]; then + REQUIREMENTS_PATH=/tmp/requirements.txt + PRINT_SIGNATURES_SCRIPT_PATH=/tmp/print_signatures.py + else + REQUIREMENTS_PATH=${PADDLE_ROOT}/python/requirements.txt + PRINT_SIGNATURES_SCRIPT_PATH=${PADDLE_ROOT}/tools/print_signatures.py + fi mkdir -p ${PADDLE_ROOT}/build/.check_api_workspace cd ${PADDLE_ROOT}/build/.check_api_workspace virtualenv -p `which python` .${spec_kind}_env source .${spec_kind}_env/bin/activate + pip install -r $REQUIREMENTS_PATH - if [ "$spec_kind" == "DEV" ]; then - pip install -r /tmp/requirements.txt - else - pip install -r ${PADDLE_ROOT}/python/requirements.txt - fi if [ -d "${PADDLE_ROOT}/build/python/dist/" ]; then pip install ${PADDLE_ROOT}/build/python/dist/*whl elif [ -d "${PADDLE_ROOT}/dist/" ];then @@ -1196,7 +1202,10 @@ function generate_api_spec() { mkdir ${PADDLE_ROOT}/build/python/dist/ && mv ${PADDLE_ROOT}/dist/*whl ${PADDLE_ROOT}/build/python/dist/ fi spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec - python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path + python ${PRINT_SIGNATURES_SCRIPT_PATH} paddle > $spec_path + python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="args,varargs,varkw,defaults,kwonlyargs,kwonlydefaults" paddle > ${spec_path}.api + python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="annotations" paddle > ${spec_path}.annotations + python ${PRINT_SIGNATURES_SCRIPT_PATH} --show-fields="document" paddle > ${spec_path}.doc # used to log op_register data_type op_type_path=${PADDLE_ROOT}/paddle/fluid/OP_TYPE_${spec_kind}.spec @@ -1214,9 +1223,6 @@ function generate_api_spec() { api_source_md5_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.source.md5 python ${PADDLE_ROOT}/tools/count_api_without_core_ops.py -p paddle > $api_source_md5_path - awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc - awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api - python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py \ ${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_${spec_kind}.spec @@ -1474,7 +1480,7 @@ function card_test() { if [ 
"${WITH_XPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 elif [ "${WITH_ROCM}" == "ON" ];then - CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l) + CUDA_DEVICE_COUNT=$(rocm-smi -i | grep DCU | wc -l) elif [ "${WITH_IPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 else @@ -1517,13 +1523,22 @@ function card_test() { if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} -V --timeout 120 -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & else - (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + if [ "$WITH_ROCM" == "ON" ];then + (env HIP_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + else + (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + fi fi else if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & else - (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + if [ "$WITH_ROCM" == "ON" ];then + (env HIP_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + else + (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + fi + fi fi done @@ -2652,7 +2667,11 @@ set -x fi if [ -a "$PADDLE_ROOT/added_ut" ];then added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$ - env CUDA_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$? + if [ "$WITH_ROCM" == "ON" ];then + env HIP_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$? + else + env CUDA_VISIBLE_DEVICES=0 ctest -R "(${added_uts})" -LE "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE|RUN_TYPE=HYBRID" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error=$? + fi ctest -R "(${added_uts})" -L "RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE" --output-on-failure --repeat-until-fail 3 --timeout 15;added_ut_error_1=$? 
if [ "$added_ut_error" != 0 ] && [ "$added_ut_error_1" != 0 ];then echo "========================================" @@ -2826,7 +2845,9 @@ set +x rerun_ut_endTime_s=`date +%s` echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt - cp $PADDLE_ROOT/build/Testing/Temporary/CTestCostData.txt ${cfs_dir}/coverage/${AGILE_PULL_ID}/${AGILE_REVISION}/ + if [ "$WITH_ROCM" != "ON" ];then + cp $PADDLE_ROOT/build/Testing/Temporary/CTestCostData.txt ${cfs_dir}/coverage/${AGILE_PULL_ID}/${AGILE_REVISION}/ + fi if [[ "$EXIT_CODE" != "0" ]]; then show_ut_retry_result fi @@ -3488,7 +3509,6 @@ function build_document_preview() { sh /paddle/tools/document_preview.sh ${PORT} } - # origin name: example function exec_samplecode_test() { if [ -d "${PADDLE_ROOT}/build/pr_whl" ];then @@ -3502,10 +3522,10 @@ function exec_samplecode_test() { cd ${PADDLE_ROOT}/tools if [ "$1" = "cpu" ] ; then - python sampcd_processor.py --debug --mode cpu; example_error=$? + python sampcd_processor.py --mode cpu; example_error=$? elif [ "$1" = "gpu" ] ; then SAMPLE_CODE_EXEC_THREADS=${SAMPLE_CODE_EXEC_THREADS:-2} - python sampcd_processor.py --threads=${SAMPLE_CODE_EXEC_THREADS} --debug --mode gpu; example_error=$? + python sampcd_processor.py --threads=${SAMPLE_CODE_EXEC_THREADS} --mode gpu; example_error=$? fi if [ "$example_error" != "0" ];then echo "Code instance execution failed" >&2 @@ -3513,6 +3533,75 @@ function exec_samplecode_test() { fi } +function need_type_checking() { + set +x + + # check pr title + TITLE_CHECK=`curl -s https://github.com/PaddlePaddle/Paddle/pull/${GIT_PR_ID} | grep "" | grep -i "typing" || true` + + if [[ ${TITLE_CHECK} ]]; then + set -x + return 0 + else + set -x + return 1 + fi +} + +function exec_type_checking() { + if [ -d "${PADDLE_ROOT}/build/pr_whl" ];then + pip install ${PADDLE_ROOT}/build/pr_whl/*.whl + else + echo "WARNING: PR wheel is not found. Use develop wheel !!!" + pip install ${PADDLE_ROOT}/build/python/dist/*.whl + fi + + python -c "import paddle;print(paddle.__version__);paddle.version.show()" + + cd ${PADDLE_ROOT}/tools + + # check all sample code + TITLE_CHECK_ALL=`curl -s https://github.com/PaddlePaddle/Paddle/pull/${GIT_PR_ID} | grep "<title>" | grep -i "typing all" || true` + + if [[ ${TITLE_CHECK_ALL} ]]; then + python type_checking.py --full-test; type_checking_error=$? + else + python type_checking.py; type_checking_error=$? + fi + + if [ "$type_checking_error" != "0" ];then + echo "Example code type checking failed" >&2 + exit 5 + fi +} + + +function exec_samplecode_checking() { + example_info_gpu="" + example_code_gpu=0 + if [ "${WITH_GPU}" == "ON" ] ; then + { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1 + example_code_gpu=$? + fi + { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1 + example_code=$? + + # TODO(megemini): type_checkding should be default after type annotation been done. + need_type_checking + type_checking_status=$? + + if [[ ${type_checking_status} -eq 0 ]]; then + { type_checking_info=$(exec_type_checking 2>&1 1>&3 3>/dev/null); } 3>&1 + type_checking_code=$? 
+ fi + + summary_check_example_code_problems $[${example_code_gpu} + ${example_code}] "${example_info_gpu}\n${example_info}" + + if [[ ${type_checking_status} -eq 0 ]]; then + summary_type_checking_problems $type_checking_code "$type_checking_info" + fi +} + function collect_ccache_hits() { ccache -s @@ -3553,10 +3642,11 @@ function test_model_benchmark() { bash ${PADDLE_ROOT}/tools/test_model_benchmark.sh } -function summary_check_problems() { +function summary_check_example_code_problems() { set +x local example_code=$1 local example_info=$2 + if [ $example_code -ne 0 ];then echo "===============================================================================" echo "*****Example code error***** Please fix the error listed in the information:" @@ -3579,6 +3669,33 @@ function summary_check_problems() { } +function summary_type_checking_problems() { + set +x + local type_checking_code=$1 + local type_checking_info=$2 + + if [ $type_checking_code -ne 0 ];then + echo "===============================================================================" + echo "*****Example code type checking error***** Please fix the error listed in the information:" + echo "===============================================================================" + echo "$type_checking_info" + echo "===============================================================================" + echo "*****Example code type checking FAIL*****" + echo "===============================================================================" + exit $type_checking_code + else + echo "===============================================================================" + echo "*****Example code type checking info*****" + echo "===============================================================================" + echo "$type_checking_info" + echo "===============================================================================" + echo "*****Example code type checking PASS*****" + echo "===============================================================================" + fi + set -x +} + + function reuse_so_cache() { get_html="https://api.github.com/repos/PaddlePaddle/Paddle" curl -X GET ${get_html}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt @@ -3631,7 +3748,10 @@ function build_pr_and_develop() { fi mv ${PADDLE_ROOT}/dist/*.whl ${PADDLE_ROOT}/build/python/dist/ cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` + # Temporarily save some scripts from PR branch cp ${PADDLE_ROOT}/python/requirements.txt /tmp + cp ${PADDLE_ROOT}/tools/print_signatures.py /tmp + generate_api_spec "$1" "PR" mkdir ${PADDLE_ROOT}/build/pr_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl rm -f ${PADDLE_ROOT}/build/python/dist/*.whl && rm -f ${PADDLE_ROOT}/build/python/build/.timestamp @@ -4262,15 +4382,7 @@ function main() { check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" set +e - example_info_gpu="" - example_code_gpu=0 - if [ "${WITH_GPU}" == "ON" ] ; then - { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code_gpu=$? - fi - { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code=$? 
- summary_check_problems $[${example_code_gpu} + ${example_code}] "${example_info_gpu}\n${example_info}" + exec_samplecode_checking assert_api_spec_approvals ;; build_and_check_cpu) @@ -4282,15 +4394,7 @@ function main() { ;; build_and_check_gpu) set +e - example_info_gpu="" - example_code_gpu=0 - if [ "${WITH_GPU}" == "ON" ] ; then - { example_info_gpu=$(exec_samplecode_test gpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code_gpu=$? - fi - { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code=$? - summary_check_problems $[${example_code_gpu} + ${example_code}] "${example_info_gpu}\n${example_info}" + exec_samplecode_checking assert_api_spec_approvals ;; check_whl_size) @@ -4395,6 +4499,9 @@ function main() { export FLAGS_PIR_OPTEST=True parallel_test true ;; + hyg_dcu_test) + parallel_test + ;; nv_cicheck_coverage) parallel_test nv_test @@ -4416,10 +4523,6 @@ function main() { build ${parallel_number} run_brpc_test ;; - assert_api) - generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} - assert_api_spec_approvals - ;; test_inference) PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" if [ "${WITH_PYTHON}" == "OFF" ] ; then @@ -4449,9 +4552,6 @@ function main() { gen_fluid_lib ${parallel_number} test_fluid_lib_train ;; - assert_api_approvals) - assert_api_spec_approvals - ;; assert_file_approvals) assert_file_diff_approvals ;; @@ -4533,11 +4633,6 @@ function main() { build ${parallel_number} build_document_preview ;; - api_example) - { example_info=$(exec_samplecode_test cpu 2>&1 1>&3 3>/dev/null); } 3>&1 - example_code=$? - summary_check_problems $example_code "$example_info" - ;; test_op_benchmark) test_op_benchmark ;; diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat index 0aeacfef7f9bd..4ffec08e666e2 100644 --- a/paddle/scripts/windows_build/build.bat +++ b/paddle/scripts/windows_build/build.bat @@ -1,5 +1,5 @@ @ECHO OFF -SETLOCAL +SETLOCAL set source_path=%1 set PYTHON_DIR=%2 set WITH_GPU=%3 diff --git a/pyproject.toml b/pyproject.toml index 4a4a5a73c5fda..0391f1bf823f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -131,3 +131,32 @@ known-first-party = ["paddle"] "test/dygraph_to_static/test_loop.py" = ["C416", "F821"] # Ignore unnecessary lambda in dy2st unittest test_lambda "test/dygraph_to_static/test_lambda.py" = ["PLC3002"] + +[tool.mypy] +python_version = "3.8" +cache_dir = ".mypy_cache" +# Miscellaneous strictness flags +allow_redefinition = true +local_partial_types = true +strict = false +# Untyped definitions and calls +check_untyped_defs = true +# Import discovery +follow_imports = "normal" +# Miscellaneous +warn_unused_configs = true +# Configuring warnings +warn_redundant_casts = true +warn_unused_ignores = true +warn_no_return = true +# Configuring error messages +show_column_numbers = true + +[[tool.mypy.overrides]] +module = [ + "astor", + "cv2", + "scipy", + "xlsxwriter" +] +ignore_missing_imports = true diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b3029a24309cf..16501a254f280 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -25,7 +25,7 @@ set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} ${FLUID_PY_FILES}) if(WITH_GPU) set(PACKAGE_NAME "paddlepaddle-gpu") elseif(WITH_ROCM) - set(PACKAGE_NAME "paddlepaddle-rocm") + set(PACKAGE_NAME "paddlepaddle-dcu") elseif(WITH_XPU) set(PACKAGE_NAME "paddlepaddle-xpu") elseif(WITH_IPU) @@ -173,17 +173,10 @@ endif() add_custom_target(paddle_python ALL DEPENDS 
${PADDLE_PYTHON_BUILD_DIR}/.timestamp) + if(BUILD_WHL_PACKAGE AND NOT WITH_SETUP_INSTALL) - add_custom_target( - paddle_copy ALL - # generate tensor.pyi for type hints - COMMAND - ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python - ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/gen_tensor_stub.py - --input-file - ${PADDLE_SOURCE_DIR}/python/paddle/tensor/tensor.prototype.pyi - --output-file ${PADDLE_BINARY_DIR}/python/paddle/tensor/tensor.pyi - DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp_wheel) + add_custom_target(paddle_copy ALL + DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp_wheel) add_dependencies(paddle_copy paddle_python) endif() diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 0cd36f299ecd6..37409b626009b 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -14,12 +14,16 @@ import typing +__is_metainfo_generated = False try: from paddle.cuda_env import * # noqa: F403 from paddle.version import ( # noqa: F401 commit as __git_commit__, full_version as __version__, ) + + __is_metainfo_generated = True + except ImportError: import sys @@ -272,6 +276,7 @@ atleast_1d, atleast_2d, atleast_3d, + block_diag, broadcast_tensors, broadcast_to, cast, @@ -433,6 +438,7 @@ inner, inverse, isfinite, + isin, isinf, isnan, isneginf, @@ -577,8 +583,7 @@ if os.path.exists(cuh_file): os.environ.setdefault('runtime_include_dir', runtime_include_dir) - -if is_compiled_with_cuda(): +if __is_metainfo_generated and is_compiled_with_cuda(): import os import platform @@ -679,7 +684,9 @@ ctypes.CDLL('msvcp140.dll') ctypes.CDLL('vcruntime140_1.dll') except OSError: - print( + import logging + + logging.error( '''Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure. It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe''' ) @@ -699,7 +706,6 @@ path_patched = False for dll in dlls: is_loaded = False - print("dll:", dll) if with_load_library_flags: res = kernel32.LoadLibraryExW(dll, None, 0x00001100) last_error = ctypes.get_last_error() @@ -733,6 +739,7 @@ ir_guard._switch_to_pir() __all__ = [ + 'block_diag', 'iinfo', 'finfo', 'dtype', @@ -846,6 +853,7 @@ 'squeeze_', 'to_tensor', 'gather_nd', + 'isin', 'isinf', 'isneginf', 'isposinf', diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 9ae60e5185ee0..34318f3cc9183 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -251,7 +251,7 @@ def _pir_transform(t, dtype): param = op.operand(0).source() cast_param = paddle.cast(param, dtype) cast_param.persistable = True - paddle._pir_ops.updata_parameter(cast_param, t.name) + paddle._pir_ops.update_parameter(cast_param, t.name) block.remove_op(op) break main.set_parameters_from(startup) diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py index 0649c3e19bf05..49863ec16363a 100644 --- a/python/paddle/autograd/backward_utils.py +++ b/python/paddle/autograd/backward_utils.py @@ -29,20 +29,21 @@ # TODO: Consider a better way to mark these ops has no grad op. # Such as use a new trait to mark these ops. +# Please keep them in alphabetical order.
ALLOW_NO_GRAD_OPS = [ # Compare ops "pd_op.equal", "pd_op.equal_", - "pd_op.not_equal", - "pd_op.not_equal_", - "pd_op.less_than", - "pd_op.less_than_", - "pd_op.less_equal", - "pd_op.less_equal_", "pd_op.greater_than", "pd_op.greater_than_", "pd_op.greater_equal", "pd_op.greater_equal_", + "pd_op.less_than", + "pd_op.less_than_", + "pd_op.less_equal", + "pd_op.less_equal_", + "pd_op.not_equal", + "pd_op.not_equal_", # Logical ops "pd_op.logical_and", "pd_op.logical_and_", @@ -67,35 +68,39 @@ "pd_op.bitwise_xor_", # Array ops "pd_op.assign_array", - "pd_op.array_length", - "pd_op.slice_array", - "pd_op.slice_array_dense", - "pd_op.assign_array", "pd_op.assign_array_", - "pd_op.create_array", - "pd_op.create_array_like", + "pd_op.array_length", + "pd_op.array_pop", "pd_op.array_read", "pd_op.array_write_", - "pd_op.array_pop", + "pd_op.create_array", + "pd_op.create_array_like", + "pd_op.slice_array", + "pd_op.slice_array_dense", # Others - "pd_op.remainder", - "pd_op.argmax", - "pd_op.print", "pd_op.accuracy", - "pd_op.randint", - "pd_op.uniform", - "pd_op.gaussian", + "pd_op.all", + "pd_op.any", + "pd_op.argmax", + "pd_op.assign_value_", "pd_op.bernoulli", + "pd_op.distribute_fpn_proposals", + "pd_op.floor_divide", "pd_op.full_like", - "pd_op.assign_value_", - "pd_op.nextafter", + "pd_op.full_with_tensor", + "pd_op.gaussian", "pd_op.isnan", "pd_op.isinf", - "pd_op.all", - "pd_op.any", + "pd_op.nextafter", + "pd_op.nonzero", + "pd_op.one_hot", + "pd_op.print", "pd_op.prior_box", + "pd_op.randint", + "pd_op.remainder", + "pd_op.shape", "pd_op.share_data_", - "pd_op.floor_divide", + "pd_op.uniform", ] diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py index 4b00161bc3c82..f412a954c0bb0 100644 --- a/python/paddle/base/core.py +++ b/python/paddle/base/core.py @@ -506,7 +506,11 @@ def _test_use_sync(value): # ops in forward_blacklist will not be replaced by composite ops. -prim_config = {"forward_blacklist": set(), "composite_ops_record": set()} +prim_config = { + "forward_blacklist": set(), + "composite_ops_record": set(), + "backward_blacklist": set(), +} def _get_batch_norm_none_var(op): @@ -588,6 +592,7 @@ def _reset_prim_forward_blacklist(): def _set_prim_backward_blacklist(*args): ops = set(args) for item in ops: + prim_config["backward_blacklist"].add(item) if not isinstance(item, str): raise TypeError("all items in set must belong to string") _set_bwd_prim_blacklist(ops) @@ -671,3 +676,15 @@ def _check_and_set_prim_vjp_skip_default_ops(): _check_and_set_prim_vjp_skip_default_ops() + + +def _check_prim_vjp_ops(): + ops_org = os.getenv("FLAGS_prim_backward_blacklist", "") + if ops_org: + ops = [] + for item in ops_org.split(";"): + ops.append(item.strip()) + _set_prim_backward_blacklist(*ops) + + +_check_prim_vjp_ops() diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index fcd69d0fd65d1..4b62b57f4e806 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -33,7 +33,6 @@ import numpy as np import paddle -import paddle.version as paddle_version from .. import pir from . 
import core, unique_name @@ -573,10 +572,10 @@ def require_version(min_version, max_version=None): ) version_installed = [ - paddle_version.major, - paddle_version.minor, - paddle_version.patch, - paddle_version.rc, + paddle.version.major, + paddle.version.minor, + paddle.version.patch, + paddle.version.rc, ] zero_version = ["0", "0", "0", "0"] @@ -591,13 +590,13 @@ def version_cmp(ver_a, ver_b): if version_cmp(version_installed, zero_version) == 0: if max_version is not None: warnings.warn( - f"PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle_version.full_version} installed. " + f"PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle.version.full_version} installed. " "Maybe you are using a develop version, " "please make sure the version is good with your code." ) else: warnings.warn( - f"PaddlePaddle version {min_version} or higher is required, but {paddle_version.full_version} installed, " + f"PaddlePaddle version {min_version} or higher is required, but {paddle.version.full_version} installed, " "Maybe you are using a develop version, " "please make sure the version is good with your code." ) @@ -619,12 +618,12 @@ def version_cmp(ver_a, ver_b): or version_cmp(version_installed, min_version_to_check) < 0 ): raise Exception( - f"VersionError: PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle_version.full_version} installed." + f"VersionError: PaddlePaddle version in [{min_version}, {max_version}] required, but {paddle.version.full_version} installed." ) else: if version_cmp(version_installed, min_version_to_check) < 0: raise Exception( - f"VersionError: PaddlePaddle version {min_version} or higher is required, but {paddle_version.full_version} installed, " + f"VersionError: PaddlePaddle version {min_version} or higher is required, but {paddle.version.full_version} installed, " f"please upgrade your PaddlePaddle to {min_version} or other higher version." 
) @@ -1617,6 +1616,9 @@ def __init__( if name is None: name = self.block.program._name_generator("_generated_var") + while self.block._find_var_recursive(name) is not None: + name = self.block.program._name_generator("_generated_var") + if dtype is not None: dtype = convert_to_proto_type(dtype) diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py index 6ffaebe444c9d..ab06767768271 100644 --- a/python/paddle/decomposition/decomp.py +++ b/python/paddle/decomposition/decomp.py @@ -850,13 +850,15 @@ def decompose_dist_program(pir_program): decompose(pir_program, []) # decomp backward ops + blacklist = core.prim_config["backward_blacklist"] + block = pir_program.global_block() + pre_combine_op = None with paddle.pir.core.program_guard(pir_program): ops = pir_program.global_block().ops for op in ops: bwd_op_name = op.name() - # todo(CZ): to be removed - if bwd_op_name in ["pd_op.mean_grad", "pd_op.concat_grad"]: + if bwd_op_name.split(".")[-1] in blacklist: continue skip_decomp = False if has_decomp_vjp(op): @@ -867,13 +869,45 @@ def decompose_dist_program(pir_program): if not skip_decomp: pir.set_insertion_point(op) orig_outs = op.results() + + is_next_split = False decomp_outs = call_decomp_vjp(op) - new_outs = _analyse_decomp_results( - orig_outs, decomp_outs, op - ) - op.replace_all_uses_with(new_outs) + for i in range(len(orig_outs)): + if orig_outs[i].has_one_use(): + next_op = orig_outs[i].first_use().owner() + if next_op.name() == "builtin.split": + is_next_split = True + _check_op_results( + next_op.name(), + next_op.results(), + decomp_outs[i], + ) + next_op.replace_all_uses_with(decomp_outs[i]) + block.remove_op(next_op) + + if not is_next_split: + new_outs = _analyse_decomp_results( + orig_outs, decomp_outs, op + ) + _check_op_results(op.name(), orig_outs, new_outs) + op.replace_all_uses_with(new_outs) + block.remove_op(op) + if op.name() == "builtin.combine": + pre_combine_op = op + + if pre_combine_op is not None: + remove_op = True + for item in pre_combine_op.results(): + if item.has_one_use(): + remove_op = False + break + if remove_op: + block.remove_op(pre_combine_op) + pre_combine_op = None + paddle.pir.set_insertion_point_to_block_end(block) + def decompose_pir_program(pir_program, param_mapping, grad_var_to_var): ''' diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 7c2439a059a34..7faa92607719c 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -58,6 +58,7 @@ from .pir_pass import ( apply_partition_pass, apply_reshard_pass, + remove_other_rank_op_pass, remove_unuseful_comm_op_pass, ) from .planner_v2 import Planner @@ -696,6 +697,8 @@ def _parallel_pir(self, mode): # collect the communicator created during resolution. apply_reshard_pass(dist_program) + remove_other_rank_op_pass(dist_program) + # Part 4: Optimization Pass # NOTE Only those Optimization Pass that related to Parallelism (need dist attr) should be placed here and all the Pass should be Optional. 
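In `_parallel_pir` above, pruning now runs as its own `remove_other_rank_op_pass` after `apply_reshard_pass`, so communication ops already exist by the time any rank-local deletion happens. A toy model of the pruning step (illustrative types only, not Paddle's real IR):

```python
from dataclasses import dataclass, field

@dataclass
class ToyOp:
    name: str
    mesh: set  # ranks that would execute this op

@dataclass
class ToyProgram:
    ops: list = field(default_factory=list)

def remove_other_rank_op_pass(program: ToyProgram, cur_rank: int) -> None:
    # the real pass walks ops in reverse so result uses vanish before producers
    program.ops = [op for op in program.ops if cur_rank in op.mesh]

prog = ToyProgram([ToyOp("matmul", {0, 1}), ToyOp("send_0_to_1", {0}), ToyOp("recv_on_1", {1})])
remove_other_rank_op_pass(prog, cur_rank=1)
print([op.name for op in prog.ops])  # ['matmul', 'recv_on_1']
```

Running it after resharding is what makes the deletion safe: by then every surviving value on a rank is produced by an op that rank actually owns.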
diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py index 130e80212f274..f5df914650c2c 100644 --- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py @@ -478,9 +478,12 @@ def _apply_post_optimization( self._strategy.gradient_merge.avg = True # gradient_merge is then train-only optimization + grad_to_global_grad = {} if self.is_train and self._strategy.gradient_merge.enable: config = copy.deepcopy(self._strategy.gradient_merge.to_dict()) config["dist_context"] = self._dist_context + config["grad_to_global_grad"] = grad_to_global_grad + config["pipeline_mode"] = self._strategy.pipeline.schedule_mode if gradient_sync_after_accumulate: config["params_grads"] = global_params_grads config[ @@ -557,4 +560,5 @@ "vpp_degree": self._strategy.pipeline.vpp_degree, "dist_context": self._dist_context, "split_backward": self._strategy.pipeline.split_backward, + "grad_to_global_grad": grad_to_global_grad, } diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py index cae150f556967..6597aebb2f9de 100644 --- a/python/paddle/distributed/auto_parallel/static/pir_pass.py +++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py @@ -89,44 +89,6 @@ def apply_partition_pass(program): var.replace_all_uses_with(reshard_var) reshard_var.get_defining_op().operand(0).set_source(var) - # pruning op and value not belong to cur rank - cur_rank = paddle.distributed.get_rank() - for op in program.global_block().ops[::-1]: - if op.name() in partition_skip_op_list: - can_delete = True - for val in op.results(): - if not val.use_empty(): - can_delete = False - if can_delete: - op.erase() - continue - if cur_rank not in op.dist_attr.process_mesh.process_ids: - op.erase() - else: - # set the operand as null when it is not belong to cur rank - if ( - op.name() == 'dist_op.reshard' - and cur_rank - not in op.operand(0) - .source() - .dist_attr() - .process_mesh.process_ids - ): - op.operand(0).set_source(None) - - # merge pd.data ops for - lr_ops = [] - for op in program.global_block().ops[::-1]: - if op.name() == 'pd_op.data' and "learning_rate" in op.attrs()["name"]: - lr_ops.append(op) - - if len(lr_ops) > 1: - lr_value = lr_ops[0].result(0) - for op in lr_ops[1:]: - lr = op.result(0) - lr.replace_all_uses_with(lr_value) - op.erase() - def apply_reshard_pass(program): for op in program.global_block().ops: @@ -160,6 +122,40 @@ def apply_reshard_pass(program): op.erase() +
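Earlier in this hunk, `_apply_post_optimization` allocates an empty `grad_to_global_grad` dict and threads it into both the gradient-merge config and the pipeline pass attributes: one pass fills the mapping in place, a later pass reads it. A self-contained sketch of that caller-owned out-parameter pattern (all names below are illustrative):

```python
# Caller-owned mapping threaded through pass configs: gradient merge fills
# it, the pipeline pass reads it later. Names here are hypothetical.
def gradient_merge_pass(config: dict) -> None:
    config["grad_to_global_grad"]["w@GRAD"] = "w@GRAD@MERGED"  # hypothetical names

def pipeline_pass(config: dict) -> None:
    for grad, global_grad in config["grad_to_global_grad"].items():
        print(f"rewire {grad} -> {global_grad}")

shared = {}
gradient_merge_pass({"grad_to_global_grad": shared})
pipeline_pass({"grad_to_global_grad": shared})
```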
+# prune ops and values that do not belong to the current rank +def remove_other_rank_op_pass(dist_program): + cur_rank = paddle.distributed.get_rank() + for op in dist_program.global_block().ops[::-1]: + if op.name() in partition_skip_op_list: + can_delete = True + for val in op.results(): + if not val.use_empty(): + can_delete = False + if can_delete: + op.erase() + continue + if cur_rank not in op.dist_attr.process_mesh.process_ids: + op.erase() + elif op.name() == "dist_op.reshard": + assert op.result( + 0 + ).use_empty(), f'There should be no useful dist.reshard op left in remove_other_rank_op_pass, but found: {op}' + op.erase() + + # merge pd.data ops for learning_rate + lr_ops = [] + for op in dist_program.global_block().ops[::-1]: + if op.name() == 'pd_op.data' and "learning_rate" in op.attrs()["name"]: + lr_ops.append(op) + + if len(lr_ops) > 1: + lr_value = lr_ops[0].result(0) + for op in lr_ops[1:]: + lr = op.result(0) + lr.replace_all_uses_with(lr_value) + op.erase() + + # Note: this is the pass in the dense program comm_ops = ["pd_op.c_allreduce_sum_", "pd_op.c_allgather"] diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py index cf4b9b7b32af1..bbc9b959b72db 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/nd_mesh_reshard_func.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import paddle import paddle.distributed as dist @@ -67,10 +66,12 @@ def get_1D_sub_process_mesh(process_mesh, mesh_dim): process_ids = np.array(process_mesh.process_ids).reshape(mesh_shape) rank_id = dist.get_rank() + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in process_mesh.process_ids: + rank_id = process_mesh.process_ids[0] coord = list(np.where(process_ids == rank_id)) coord[mesh_dim] = range(mesh_shape[mesh_dim]) sub_process_ids = process_ids[tuple(coord)].flatten() - sub_mesh_shape = sub_process_ids.shape sub_mesh_name = dim_names[mesh_dim] return dist.ProcessMesh(sub_process_ids, [sub_mesh_name]) @@ -106,35 +107,31 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): first_diff_axis = find_first_diff_shard_axis( src_dist_attr, dst_dist_attr ) - ori_dst_dist_attr = copy_dist_attr_with_new_member(dst_dist_attr) - out_value = src_value # intermediate result - src_type = src_value.type() + # out_value = src_value # intermediate result + # src_type = src_value.type() tensor_ndim = len(src_value.shape) process_mesh = dst_dist_attr.process_mesh # Step2. Convert the non-replicated dimensions to replicated. # Step2.1.
convert partial status to replicated - real_out_dist_attr = copy_dist_attr_with_new_member(src_dist_attr) if is_partial(src_dist_attr): - in_partial_status = copy.deepcopy(src_dist_attr.partial_status) + in_partial_status = src_dist_attr.partial_status out_partial_status = dst_dist_attr.partial_status # read-only # convert each partial dim to replicated with corresponding # 1-D mesh function for partial_dim, partial_type in in_partial_status.items(): - if ( - partial_dim in out_partial_status - or partial_dim in ori_dst_dist_attr.dims_mapping - ): + if partial_dim in out_partial_status: continue # get the partial status after converting - real_out_partial_status = copy.deepcopy( - real_out_dist_attr.partial_status + tmp_partial_status = src_dist_attr.partial_status + tmp_partial_status.pop(partial_dim) + tmp_dst_dist_attr = copy_dist_attr_with_new_member( + src_dist_attr, + new_partial_status=tmp_partial_status, ) - real_out_partial_status.pop(partial_dim) - real_out_dist_attr = copy_dist_attr_with_new_member( - real_out_dist_attr, - new_partial_status=real_out_partial_status, + tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + src_value.type(), tmp_dst_dist_attr ) # get the process_mesh on specific axis @@ -160,28 +157,29 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): ) one_dim_func = PToRReshardFunction() - out_value = one_dim_func.reshard( + src_value = one_dim_func.reshard( in_one_dim_dist_attr, out_one_dim_dist_attr, - out_value, - src_type, + src_value, + tmp_dst_type, ) - - out_value.update_dist_attr(real_out_dist_attr) + src_dist_attr = tmp_dst_dist_attr # Step2.2 convert shard status to replicated for i in range(first_diff_axis, -1, -1): - in_mesh_axis = real_out_dist_attr.dims_mapping[i] - if in_mesh_axis == -1: + in_mesh_axis = src_dist_attr.dims_mapping[i] + out_mesh_axis = dst_dist_attr.dims_mapping[i] + if in_mesh_axis == -1 or in_mesh_axis == out_mesh_axis: continue # calculate the dist_attr after converting - real_out_dims_mapping = copy.deepcopy( - real_out_dist_attr.dims_mapping + tmp_dims_mapping = src_dist_attr.dims_mapping + tmp_dims_mapping[i] = -1 + tmp_dst_dist_attr = copy_dist_attr_with_new_member( + src_dist_attr, new_dims_mapping=tmp_dims_mapping ) - real_out_dims_mapping[i] = -1 - real_out_dist_attr = copy_dist_attr_with_new_member( - real_out_dist_attr, new_dims_mapping=real_out_dims_mapping + tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + src_value.type(), tmp_dst_dist_attr ) # get the process_mesh on specific axis @@ -205,45 +203,41 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): ) one_dim_func = SToRReshardFunction() - out_value = one_dim_func.reshard( - in_one_dim_dist_attr, out_one_dim_dist_attr, out_value, src_type + src_value = one_dim_func.reshard( + in_one_dim_dist_attr, + out_one_dim_dist_attr, + src_value, + tmp_dst_type, ) - - out_value.update_dist_attr(real_out_dist_attr) + src_dist_attr = tmp_dst_dist_attr # Step3. 
Convert the replicated status to the status in dst_dist_attr # Step3.1 convert replicated to partial - if is_partial(ori_dst_dist_attr): - in_partial_status = out_value.dist_attr.partial_status - out_partial_status = ori_dst_dist_attr.partial_status + if is_partial(dst_dist_attr): + in_partial_status = src_dist_attr.partial_status + out_partial_status = dst_dist_attr.partial_status for partial_dim, partial_type in out_partial_status.items(): if partial_dim in in_partial_status: continue - raise NotImplementedError( "RToPReshardFunction is not implemented" ) - # Step3.2 convert replicated/partial to shard + # Step3.2 convert replicated to shard for i in range(first_diff_axis, -1, -1): - out_mesh_axis = ori_dst_dist_attr.dims_mapping[i] - if out_mesh_axis == -1: + in_mesh_axis = src_dist_attr.dims_mapping[i] + out_mesh_axis = dst_dist_attr.dims_mapping[i] + if in_mesh_axis == out_mesh_axis: continue - in_partial_status = out_value.dist_attr().partial_status - need_p2s = out_mesh_axis in in_partial_status - dims_mapping = copy.deepcopy(real_out_dist_attr.dims_mapping) - dims_mapping[i] = out_mesh_axis - partial_status = None - if out_mesh_axis in real_out_dist_attr.partial_status: - partial_status = copy.deepcopy( - real_out_dist_attr.partial_status - ) - partial_status.pop(out_mesh_axis) - real_out_dist_attr = copy_dist_attr_with_new_member( - real_out_dist_attr, - new_dims_mapping=dims_mapping, - new_partial_status=partial_status, + # calculate the dist_attr after converting + tmp_dims_mapping = src_dist_attr.dims_mapping + tmp_dims_mapping[i] = out_mesh_axis + tmp_dst_dist_attr = copy_dist_attr_with_new_member( + src_dist_attr, new_dims_mapping=tmp_dims_mapping + ) + tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + src_value.type(), tmp_dst_dist_attr ) # get the process_mesh on specific axis @@ -265,23 +259,15 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): sub_mesh, out_one_dim_dims_mapping, {} ) ) - - if need_p2s: - raise NotImplementedError( - "PToSReshardFunction is not implemented" - ) - else: - one_dim_func = RToSReshardFunction() - out_value = one_dim_func.reshard( - in_one_dim_dist_attr, - out_one_dim_dist_attr, - out_value, - dst_type, - ) - out_value.update_dist_attr(real_out_dist_attr) - - out_value.set_type(dst_type) - return out_value + one_dim_func = RToSReshardFunction() + src_value = one_dim_func.reshard( + in_one_dim_dist_attr, + out_one_dim_dist_attr, + src_value, + tmp_dst_type, + ) + src_dist_attr = tmp_dst_dist_attr + return src_value class NdMeshReshardFunctionCrossMesh(ReshardFunction): @@ -310,20 +296,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( src_value.type(), tmp_dist_attr ) - out_value = same_status_func.reshard( + src_value = same_status_func.reshard( src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type ) - if out_value is None: - return None - - curr_global_rank = paddle.distributed.get_rank() - if curr_global_rank in dst_dist_attr.process_mesh.process_ids: - nd_mesh_func = NdMeshReshardFunction() - assert nd_mesh_func.is_suitable( - tmp_dist_attr, dst_dist_attr - ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}" - return nd_mesh_func.reshard( - tmp_dist_attr, dst_dist_attr, out_value, dst_type - ) - return None + nd_mesh_func = NdMeshReshardFunction() + assert nd_mesh_func.is_suitable( + tmp_dist_attr, dst_dist_attr + ), f"Invoking the nd mesh reshard function from {tmp_dist_attr} to {dst_dist_attr} is not valid" + return nd_mesh_func.reshard( + tmp_dist_attr, dst_dist_attr, src_value, dst_type + )
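The rewritten `NdMeshReshardFunction.reshard` above normalizes axis by axis: clear partial dims, unshard mismatched dims, then shard toward the target, each step being a 1-D reshard on a sub-mesh with `tmp_dst_dist_attr`/`tmp_dst_type` re-derived per hop. A runnable toy that plans the shard/unshard hops (partial handling omitted; my own simplification, not Paddle code):

```python
# Reduced model of the steps above: fix one mesh axis at a time,
# first unsharding mismatches (s_to_r), then sharding toward the target (r_to_s).
def nd_mesh_reshard_plan(src_mapping: list, dst_mapping: list) -> list:
    plan, cur = [], list(src_mapping)
    for i, (s, d) in enumerate(zip(cur, dst_mapping)):   # Step 2.2: shard -> replicated
        if s != -1 and s != d:
            plan.append(f"s_to_r on tensor dim {i} (mesh axis {s})")
            cur[i] = -1
    for i, (c, d) in enumerate(zip(cur, dst_mapping)):   # Step 3.2: replicated -> shard
        if c != d:
            plan.append(f"r_to_s on tensor dim {i} (mesh axis {d})")
            cur[i] = d
    return plan

print(nd_mesh_reshard_plan([0, -1], [-1, 1]))
# ['s_to_r on tensor dim 0 (mesh axis 0)', 'r_to_s on tensor dim 1 (mesh axis 1)']
```

Threading `(src_value, src_dist_attr)` forward after each hop, as the patch now does, avoids the separate `out_value`/`real_out_dist_attr` bookkeeping that the old code had to keep in sync by hand.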
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py index 8956cc2535d9b..d5046ff0f7963 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/p_to_r_reshard_func.py @@ -47,7 +47,7 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): src_reduce_type = ReduceOp.SUM reduce_mean = True - group = new_process_group(src_mesh.process_ids) + group = new_process_group(sorted(src_mesh.process_ids)) reduced_value = paddle._C_ops.c_allreduce_sum_( src_value, group.id, True, False ) @@ -95,20 +95,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): tmp_dst_type = paddle.base.libpaddle.pir.cvt_to_dist_type( src_value.type(), tmp_dist_attr ) - out_value = same_status_func.reshard( + src_value = same_status_func.reshard( src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type ) - if out_value is None: - return None - - curr_global_rank = paddle.distributed.get_rank() - if curr_global_rank in dst_dist_attr.process_mesh.process_ids: - p_to_r_func = PToRReshardFunction() - assert p_to_r_func.is_suitable( - tmp_dist_attr, dst_dist_attr - ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}" - return p_to_r_func.reshard( - tmp_dist_attr, dst_dist_attr, out_value, dst_type - ) - return None + p_to_r_func = PToRReshardFunction() + assert p_to_r_func.is_suitable( + tmp_dist_attr, dst_dist_attr + ), f"Invoking the p to r reshard function from {tmp_dist_attr} to {dst_dist_attr} is not valid" + return p_to_r_func.reshard( + tmp_dist_attr, dst_dist_attr, src_value, dst_type + ) diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py index 922df440c5a21..3b54fa4d8a728 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/r_to_s_reshard_func.py @@ -59,15 +59,17 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): out_value = paddle.slice(src_value, [split_axis], [start], [end]) - out_value.set_type(src_value.type()) - out_value.update_dist_attr(dst_dist_attr) + out_value.set_type(dst_type) out_value.get_defining_op().dist_attr = ( paddle.base.libpaddle.pir.create_op_dist_attribute( mesh, [src_dist_attr], [dst_dist_attr] ) ) return out_value - return None
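Just below, the old `return None` escape for ranks outside the destination mesh is replaced by a typed `reshard_v2` placeholder: callers no longer branch on `None`, and use-def chains stay intact until `remove_other_rank_op_pass` erases the fake op. The gist, as a self-contained toy (dicts stand in for IR values; not real Paddle IR):

```python
# Toy rendering of the idea: every rank gets a value of the right type;
# only participants get real compute behind it.
def reshard_value(value: dict, dst_type: str, participates: bool) -> dict:
    if participates:
        return {"op": "slice", "type": dst_type, "src": value}
    # ~ the reshard_v2 fake var: correct type, no real computation behind it
    return {"op": "placeholder_reshard", "type": dst_type, "src": value}

out = reshard_value({"op": "data", "type": "f32[8]"}, "f32[4]", participates=False)
assert out["type"] == "f32[4]"  # callers can rely on the type either way
```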
+ fake_var = paddle._C_ops.reshard_v2(src_value, dst_dist_attr) + fake_var.set_type(dst_type) + return fake_var class RToSReshardFunctionCrossMesh(ReshardFunction):
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py index 5a907839cf78b..42d92392b65c9 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/s_to_r_reshard_func.py @@ -102,8 +102,7 @@ def get_split_axis_with_dims_mapping(dims_mapping): return new_value else: # TODO(ywt01) support unbalanced split - pass - return None + raise NotImplementedError("unbalanced split is not implemented") def reshard_s_to_r_with_padding( self, ): src_mesh = src_dist_attr.process_mesh num_of_process = len(src_mesh.process_ids) - dtype = src_value.dtype - group = new_process_group(src_mesh.process_ids) + + group = new_process_group(sorted(src_mesh.process_ids)) allgather_value = paddle._C_ops.c_allgather( src_value, group.id, num_of_process, True ) @@ -138,11 +137,32 @@ def reshard_s_to_r_with_padding( if split_axis != 0 or padding_num != 0: allgather_op = allgather_value.get_defining_op() - paddle.pir.set_insertion_point_after(allgather_op) - split_value = paddle._C_ops.split_with_num( + split_values = paddle._C_ops.split_with_num( allgather_op.result(0), num_of_process, 0 ) - concat_value = paddle._C_ops.concat(split_value, split_axis) + builtin_split_op = split_values[0].get_defining_op() + pd_split_op = builtin_split_op.operand_source(0).get_defining_op() + + # fix the split_with_num dist attribute. + new_inner_types = [] + for sub_value in split_values: + new_inner_type = paddle.base.libpaddle.pir.cvt_to_dist_type( + sub_value.type(), allgather_value.dist_attr() + ) + new_inner_types.append(new_inner_type) + sub_value.set_type(new_inner_type) + vec_type = paddle.base.libpaddle.pir.create_vec_type( + new_inner_types + ) + pd_split_op.result(0).set_type(vec_type) + + concat_value = paddle._C_ops.concat(split_values, split_axis) + # fold builtin.split op and builtin.combine op + concat_op = concat_value.get_defining_op() + builtin_combine_op = concat_op.operand_source(0).get_defining_op() + concat_op.operand(0).set_source(pd_split_op.result(0)) + builtin_combine_op.erase() + builtin_split_op.erase() return concat_value return allgather_value @@ -183,16 +203,11 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): out_value = same_status_func.reshard( src_dist_attr, tmp_dist_attr, src_value, tmp_dst_type ) - if out_value is None: - return None - - curr_global_rank = paddle.distributed.get_rank() - if curr_global_rank in dst_dist_attr.process_mesh.process_ids: - s_to_r_func = SToRReshardFunction() - assert s_to_r_func.is_suitable( - tmp_dist_attr, dst_dist_attr - ), f"Invoke the p to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}" - return s_to_r_func.reshard( - tmp_dist_attr, dst_dist_attr, out_value, dst_type - ) - return None + + s_to_r_func = SToRReshardFunction() + assert s_to_r_func.is_suitable( + tmp_dist_attr, dst_dist_attr + ), f"Invoking the s to r reshard function is not valid from {tmp_dist_attr} to {dst_dist_attr}" + return s_to_r_func.reshard( + tmp_dist_attr, dst_dist_attr, out_value, dst_type + )
diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py
b/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py index ceae2e7424fd6..db6ec8d1df238 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/same_status_reshard_func.py @@ -87,11 +87,14 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): dst_mesh, [], [dst_dist_attr] ) ) - recv_value.update_dist_attr(dst_dist_attr) + recv_value.set_type(dst_type) is_send = False break if is_send: - return None + # fake var will be removed in remove_other_rank_op_pass. + fake_var = paddle._C_ops.reshard_v2(src_value, dst_dist_attr) + fake_var.set_type(dst_type) + return fake_var else: return recv_value diff --git a/python/paddle/distributed/auto_tuner/utils.py b/python/paddle/distributed/auto_tuner/utils.py index 741120f7fe598..2db4cb6e0bdcc 100644 --- a/python/paddle/distributed/auto_tuner/utils.py +++ b/python/paddle/distributed/auto_tuner/utils.py @@ -1068,7 +1068,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): prefix + str(cfg[arg]) if prefix else cfg[arg] ) json.dump(cmd_cfg, open(cmd[arg][0], "w")) - if tuner_cfg["run_cmd"].get("generate_launch_cfg", True): + if ( + tuner_cfg["run_cmd"].get("generate_launch_cfg", True) + and not run_best + ): new_cmd_apth = ( os.path.splitext(cmd[arg][0])[0] + "_" @@ -1107,7 +1110,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): prefix + str(cfg[arg]) if prefix else cfg[arg] ) yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) - if tuner_cfg["run_cmd"].get("generate_launch_cfg", True): + if ( + tuner_cfg["run_cmd"].get("generate_launch_cfg", True) + and not run_best + ): new_cmd_apth = ( os.path.splitext(cmd[arg][0])[0] + cfg["log_dir_name"] @@ -1157,7 +1163,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): else: cmd_cfg[keys[-1]] = rr_values json.dump(cmd_cfg, open(cmd[arg][0], "w")) - if tuner_cfg["run_cmd"].get("generate_launch_cfg", True): + if ( + tuner_cfg["run_cmd"].get("generate_launch_cfg", True) + and not run_best + ): new_cmd_apth = ( os.path.splitext(cmd[arg][0])[0] + cfg["log_dir_name"] @@ -1198,7 +1207,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): else: cmd_cfg[keys[-1]] = rr_values yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) - if tuner_cfg["run_cmd"].get("generate_launch_cfg", True): + if ( + tuner_cfg["run_cmd"].get("generate_launch_cfg", True) + and not run_best + ): new_cmd_apth = ( os.path.splitext(cmd[arg][0])[0] + cfg["log_dir_name"] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index 63f76416142c1..ba4c61a1f917a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -812,6 +812,7 @@ def copy_attr(attr_name): copy_attr("optimize_attr") copy_attr("do_model_average") copy_attr("need_clip") + copy_attr("no_sync") self._slice_params[param.name] = slice_param return slice_param diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 33b8c3d95d582..db8c2f7b9b820 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -24,6 +24,7 @@ paddle.float32: 
"float32", paddle.float64: "float64", paddle.bfloat16: "bfloat16", + paddle.bool: "bool", } PADDLE_TO_NUMBER = { @@ -33,6 +34,7 @@ paddle.int32: 3, paddle.int64: 4, paddle.bfloat16: 5, + paddle.bool: 6, } NUMBER_TO_DTYPE = { @@ -42,6 +44,7 @@ 3: "int32", 4: "int64", 5: "bfloat16", + 6: "bool", } diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 816af6f91530d..53d929c7890bd 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -451,7 +451,9 @@ def check_layer_sparse(sublayer): return False is_sparse_gradient = [ - check_layer_sparse(sublayer) for sublayer, _ in layers_param + check_layer_sparse(sublayer) + for sublayer, param in layers_param + if not getattr(param, "no_sync", False) ] if in_dynamic_mode(): diff --git a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py index 77affd4cd9c1e..e22cc5bbf6d65 100644 --- a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py +++ b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py @@ -138,8 +138,14 @@ def _split_matmul_grad_and_multi_streaming_allreduce( name: allreduce_op.output(name) for name in allreduce_op_outputs } + # matmul_v2 + reshape + reshape + matmul_v2 + reshape + ... + original c_allreduce_sum + # => + # matmul_v2 + new c_allreduce_sum + reshape + reshape + matmul_v2 + reshape + ... + original c_allreduce_sum + # + # NOTE(liym27): new c_allreduce_sum must be inserted to "the next of the first matmul_v2", otherwise another + # pass fused_linear_param_grad_add will not work. allreduce_op = block._insert_op_without_sync( - index=allreduce_id + 1, + index=matmul_grad_id + 1, type=allreduce_op.type, inputs=allreduce_op_inputs, outputs=allreduce_op_outputs, diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index aab9bdb2456a0..2d7413965ae3b 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -523,6 +523,8 @@ def parse_program( dist_context, ) + return grad_to_gradient_merge + @register_pass("auto_parallel_gradient_merge_pass") class GradientMergePass(PassBase): @@ -550,8 +552,9 @@ def _apply_single_impl(self, main_program, startup_program, context): gradient_sync_after_accumulate = self.get_attr( "gradient_sync_after_accumulate", False ) + grad_to_global_grad = self.get_attr("grad_to_global_grad", {}) with paddle.static.program_guard(main_program, startup_program): - parse_program( + grad_to_merge_grad = parse_program( main_program, startup_program, params_grads, @@ -562,3 +565,5 @@ def _apply_single_impl(self, main_program, startup_program, context): ) main_program._sync_with_cpp() + for k, v in grad_to_merge_grad.items(): + grad_to_global_grad[k] = v diff --git a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py index 4fc9a1ec28692..8bc29411269ab 100644 --- a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py +++ b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_vpp.py @@ -15,6 +15,10 @@ import logging from paddle.base import core +from paddle.distributed.auto_parallel.static.operators.common import ( + is_data_parallel_reduce_op, + is_data_parallel_scale_op, +) from ...utils.log_utils import 
get_logger from ..pass_base import register_pass @@ -36,7 +40,8 @@ class PipelineVirtualPipelinePass(PipelinePassBase): def __init__(self): super().__init__() - + self._real_overlap_sharding_reduce = False + self.reduce_comm_suffix = "_reduce" self._forward_micro_step_counter = {} self._backward_micro_step_counter = {} @@ -137,10 +142,22 @@ def _get_virtual_pp_rank(micro_step, forward): if real_split_backward: for chunk_id in range(num_model_chunks - 1, -1, -1): for micro_batch_id in range(0, accumulate_steps): - w_job = core.Job(BACKWARD + "_w" + str(chunk_id)) + if ( + self._real_overlap_sharding_reduce + and micro_batch_id == accumulate_steps - 1 + ): + w_job = core.Job( + BACKWARD + + "_w" + + str(chunk_id) + + self.reduce_comm_suffix + ) + else: + w_job = core.Job(BACKWARD + "_w" + str(chunk_id)) w_job.set_micro_batch_id(micro_batch_id) job_list.append(w_job) - + job_types = [job.type() for job in job_list] + logger.debug(f"The VPP job list: {job_types}") opt_job = core.Job(OPT) job_list.append(opt_job) return job_list @@ -162,6 +179,102 @@ def _split_matmul_grad_ops_to_matmul(self, program, dist_context): block, matmul_grad_id, dist_context=dist_context ) + def _move_sharding_comm_to_backward( + self, types, sub_programs, global_grads + ): + def _get_sharding_comm_op(op, idx, ops): + if is_data_parallel_reduce_op(op): + op_input_names = op.desc.input_arg_names() + op_output_names = op.desc.output_arg_names() + if ( + op_input_names[0] == op_output_names[0] + and op_input_names[0] in global_grads + ): + global_grad_to_comm_op[op_input_names[0]] = [op] + remove_op_ids.append(idx) + + if op.type in ["c_allreduce_sum", "c_reduce_sum"]: + scale_index = idx + 1 + if scale_index < len(ops): + if is_data_parallel_scale_op(ops[scale_index]): + global_grad_to_comm_op[op_input_names[0]].append(op) + remove_op_ids.append(scale_index) + + def _get_scale_op(op, idx): + if is_data_parallel_scale_op(op): + return + if op.type == 'scale': + op_input_names = op.desc.input_arg_names() + op_output_names = op.desc.output_arg_names() + if ( + op_input_names[0] == op_output_names[0] + and op_input_names[0] in global_grads + ): + global_grad_to_scale_op[op_input_names[0]] = op + remove_op_ids.append(idx) + + # 1. get all the sharding_avg ops in the optimizer program + type_programs = dict(zip(types, sub_programs)) + opt_program = type_programs["optimizer"] + global_grad_to_comm_op = {} + global_grad_to_scale_op = {} + all_remove_op_ids = [] + for cur_block in opt_program.blocks: + remove_op_ids = [] + for idx, op in enumerate(cur_block.ops): + _get_scale_op(op, idx) + _get_sharding_comm_op(op, idx, cur_block.ops) + all_remove_op_ids.append(remove_op_ids) + if len(global_grad_to_comm_op) == 0: # no need to overlap sharding comm + return False + + # 2. create the new backward_w programs with the sharding comm + new_types = [] + new_programs = [] + for type, sub_program in type_programs.items(): + if "backward_w" in type: + new_program = sub_program.clone() + cur_block = new_program.global_block() + cur_block_scale_op = [] + for idx, op in reversed(list(enumerate(cur_block.ops))): + if op.type == "elementwise_add": + input_arg_names = op.input_arg_names + output_arg_names = op.output_arg_names + if ( + input_arg_names[0] == output_arg_names[0] + and input_arg_names[0] in global_grad_to_comm_op + ): + for origin_op in reversed( + global_grad_to_comm_op[input_arg_names[0]] + ): + new_op = cur_block._insert_op_without_sync( + index=idx + 1, type="nop" + ) + new_op.desc.copy_from(origin_op.desc) + del
global_grad_to_comm_op[input_arg_names[0]] + cur_block_scale_op.append( + global_grad_to_scale_op[input_arg_names[0]] + ) + for origin_op in cur_block_scale_op: + new_op = cur_block.append_op(type="nop") + new_op.desc.copy_from(origin_op.desc) + cur_block._sync_with_cpp() + new_types.append(type + self.reduce_comm_suffix) + new_programs.append(new_program) + assert ( + len(global_grad_to_comm_op) == 0 + ), f"global_grad_to_comm_op must be used up, but left: {global_grad_to_comm_op}" + + types.extend(new_types) + sub_programs.extend(new_programs) + + for id, cur_block in enumerate(opt_program.blocks): + for op_id in reversed(all_remove_op_ids[id]): + cur_block._remove_op(op_id) + cur_block._sync_with_cpp() + + return True + def _partial_programs(self, program): dist_context = self.get_attr("dist_context") num_model_chunks = self.get_attr("vpp_degree") @@ -169,7 +282,10 @@ def _partial_programs(self, program): accumulate_steps = self.get_attr("num_micro_batches") num_stages = self.get_attr("pp_degree") split_backward = self.get_attr("split_backward", False) - + grad_to_global_grad = self.get_attr("grad_to_global_grad", {}) + global_grads = [ + global_grad for _, global_grad in grad_to_global_grad.items() + ] if split_backward and accumulate_steps == num_stages: self._split_matmul_grad_ops_to_matmul(program, dist_context) types, sub_program_list = _program_for_vpp_split_bwk( @@ -178,6 +294,11 @@ def _partial_programs(self, program): dist_context, enable_send_recv_overlap, ) + self._real_overlap_sharding_reduce = ( + self._move_sharding_comm_to_backward( + types, sub_program_list, global_grads + ) + ) else: types, sub_program_list = _program_for_vpp( program, diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py index 246c4ffb71173..168fbc460d5bd 100644 --- a/python/paddle/distribution/__init__.py +++ b/python/paddle/distribution/__init__.py @@ -34,6 +34,7 @@ from .multivariate_normal import MultivariateNormal from .normal import Normal from .poisson import Poisson +from .student_t import StudentT from .transform import ( # noqa:F401 AbsTransform, AffineTransform, @@ -77,6 +78,7 @@ 'Geometric', 'Binomial', 'Poisson', + 'StudentT', ] __all__.extend(transform.__all__) diff --git a/python/paddle/distribution/student_t.py b/python/paddle/distribution/student_t.py new file mode 100644 index 0000000000000..d1a88887023ff --- /dev/null +++ b/python/paddle/distribution/student_t.py @@ -0,0 +1,277 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections.abc import Sequence + +import paddle +from paddle.base.data_feeder import check_type, convert_dtype +from paddle.base.framework import Variable +from paddle.distribution import Gamma, distribution +from paddle.framework import in_dynamic_mode + + +class StudentT(distribution.Distribution): + r""" + The StudentT distribution with parameters: `df`, `loc`, `scale`. 
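Editorial note, not part of this patch: the sampler defined later in this class uses the classic representation T = Z / sqrt(V / nu) with Z ~ N(0, 1) and V ~ chi^2_nu, and chi^2_nu is exactly Gamma(concentration=nu/2, rate=1/2), which is why `_chi2` below is constructed as `Gamma(0.5 * df, 0.5)`. A NumPy sketch of the same construction:

import numpy as np

rng = np.random.default_rng(0)
df, n = 10.0, 100_000
z = rng.standard_normal(n)
v = rng.gamma(shape=0.5 * df, scale=2.0, size=n)  # rate 1/2 <=> scale 2, i.e. chi^2_df
t = z / np.sqrt(v / df)
print(t.var())  # close to the exact variance df / (df - 2) = 1.25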
+ + In probability theory and statistics, the StudentT distribution is one of the basic continuous probability distributions + defined on the real number set. + + The probability density function (pdf) is + + .. math:: + + pdf(x; \nu, \mu, \sigma) = \frac{\Gamma[(\nu+1)/2]}{\sigma\sqrt{\nu\pi}\Gamma(\nu/2)[1+(\frac{x-\mu}{\sigma})^2/\nu]^{(1+\nu)/2}} + + In the above equation: + + * :math:`df = \nu`: is the degree of freedom. + * :math:`loc = \mu`: is the center parameter. + * :math:`scale = \sigma`: is the scale parameter. + * :math:`\Gamma(\cdot)`: is the gamma function. + + Args: + df (float|Tensor): The degree of freedom of the distribution, which should be non-negative. If the input data type is float, + the data type of `df` will be converted to a 1-D Tensor with paddle global default dtype. Supported dtype: float32, float64. + loc (float|Tensor): The center of the distribution. If the input data type is float, the data type of `loc` will be converted to a + 1-D Tensor with paddle global default dtype. Supported dtype: float32, float64. + scale (float|Tensor): The scale of the distribution, which should be non-negative. If the input data type is float, the data type + of `scale` will be converted to a 1-D Tensor with paddle global default dtype. Supported dtype: float32, float64. + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Examples: + .. code-block:: python + + >>> import paddle + >>> from paddle.distribution import StudentT + >>> paddle.set_device('cpu') + >>> paddle.seed(100) + >>> dist = StudentT(df=10.0, loc=0.0, scale=1.0) + >>> dist.sample([3]) + Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-2.07709980], + [ 0.27981189], + [ 0.00881413]]) + + >>> dist2 = StudentT(df=paddle.to_tensor([10.0, 5.0]), loc=paddle.to_tensor([0.0, 0.0]), scale=paddle.to_tensor([1.0, 2.0])) + >>> value_tensor = paddle.to_tensor([0.8], dtype="float32") + >>> lp = dist2.log_prob(value_tensor) + >>> print(lp) + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, + [-1.28509235, -1.75626254]) + + >>> p = dist2.prob(value_tensor) + >>> print(p) + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.27662504, 0.17268908]) + + >>> entropy = dist2.entropy() + >>> print(entropy) + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, + [1.52126312, 2.32064891]) + + """ + + def __init__(self, df, loc, scale, name=None): + if not in_dynamic_mode(): + check_type( + df, + 'df', + ( + float, + Variable, + paddle.pir.Value, + ), + 'StudentT', + ) + check_type( + loc, + 'loc', + ( + float, + Variable, + paddle.pir.Value, + ), + 'StudentT', + ) + check_type( + scale, + 'scale', + ( + float, + Variable, + paddle.pir.Value, + ), + 'StudentT', + ) + + self.name = name if name is not None else 'StudentT' + self.dtype = paddle.get_default_dtype() + + if self._validate_args(df, loc, scale): + self.df = df + self.loc = loc + self.scale = scale + self.df, self.loc, self.scale = paddle.broadcast_tensors( + [self.df, self.loc, self.scale] + ) + self.dtype = convert_dtype(df.dtype) + else: + self.df, self.loc, self.scale = self._to_tensor(df, loc, scale) + + if not self._check_nonnegative(self.df): + raise ValueError( + 'Every element of input parameter `df` should be nonnegative.' + ) + if not self._check_nonnegative(self.scale): + raise ValueError( + 'Every element of input parameter `scale` should be nonnegative.' 
+ ) + + if self.df.shape == []: + self.df = self.df.reshape([1]) + self.loc = self.loc.reshape([1]) + self.scale = self.scale.reshape([1]) + batch_shape = self.df.shape + super().__init__(batch_shape) + self._chi2 = Gamma(0.5 * self.df, paddle.full_like(self.df, 0.5)) + + def _check_nonnegative(self, value): + """Check the non-negative constraint for input parameters + + Args: + value (Tensor) + + Returns: + bool: pass or not. + """ + return (value >= 0.0).all() + + @property + def mean(self): + """Mean of StudentT distribution. + + Returns: + Tensor: mean value. + """ + return paddle.where( + self.df > 1.0, + self.loc, + paddle.full_like(self.loc, fill_value=float('nan')), + ) + + @property + def variance(self): + """Variance of StudentT distribution. + + Returns: + Tensor: variance value. + """ + var = self.df.clone().detach() + var_condition = self.df > 2.0 + var = paddle.where( + var_condition, + self.scale.pow(2) * var / (var - 2), + paddle.full_like(var, fill_value=float('nan')), + ) + inf_condition = (self.df <= 2.0).logical_and(self.df > 1.0) + var = paddle.where( + inf_condition, paddle.full_like(var, fill_value=float('inf')), var + ) + return var + + def sample(self, shape=()): + """Generate StudentT samples of the specified shape. The final shape would be ``shape+batch_shape`` . + + Args: + shape (Sequence[int], optional): Prepended shape of the generated samples. + + Returns: + Tensor: Sampled data with shape `sample_shape` + `batch_shape`. + """ + if not isinstance(shape, Sequence): + raise TypeError('sample shape must be Sequence object.') + + output_shape = self._extend_shape(shape) + z = paddle.cast(paddle.normal(shape=output_shape), self.dtype) + chi2 = self._chi2.sample(shape) + x = z * paddle.rsqrt(chi2 / self.df) + return self.loc + self.scale * x + + def entropy(self): + r"""Shannon entropy in nats. + + The entropy is + + .. math:: + + H = \log(\frac{\Gamma(\nu/2)\Gamma(1/2) \sigma \sqrt{\nu}}{\Gamma[(1+\nu)/2]}) + \frac{(1+\nu)}{2} \cdot \{\psi[(1+\nu)/2] - \psi(\nu/2)\} + + In the above equation: + + * :math:`\nu`: is the degree of freedom. + * :math:`\Gamma()`: is the gamma function. + * :math:`\psi()`: is the digamma function. + + Returns: + Tensor: Shannon entropy of StudentT distribution. The data type is the same as `df`. + """ + lbeta = ( + paddle.lgamma(0.5 * self.df) + + math.lgamma(0.5) + - paddle.lgamma(0.5 * (self.df + 1)) + ) + return ( + self.scale.log() + + 0.5 + * (self.df + 1) + * ( + paddle.digamma(0.5 * (self.df + 1)) + - paddle.digamma(0.5 * self.df) + ) + + 0.5 * self.df.log() + + lbeta + ) + + def log_prob(self, value): + """Log probability density function. + + Args: + value (Tensor): The input tensor. + + Returns: + Tensor: log probability density. The data type is the same as `df`. + """ + value = self._check_values_dtype_in_probs(self.df, value) + y = (value - self.loc) / self.scale + Z = ( + self.scale.log() + + 0.5 * self.df.log() + + 0.5 * math.log(math.pi) + + paddle.lgamma(0.5 * self.df) + - paddle.lgamma(0.5 * (self.df + 1.0)) + ) + return -0.5 * (self.df + 1.0) * paddle.log1p(y**2.0 / self.df) - Z + + def prob(self, value): + """Probability density function. + + Args: + value (Tensor): The input tensor. + + Returns: + Tensor: probability density. The data type is the same as `df`. 
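Editorial note, not part of this patch: `log_prob` above is the standard location-scale Student t log density. A quick numerical cross-check of the normalizer `Z`, assuming SciPy is available:

import math

import numpy as np
from scipy.stats import t as student_t

df, loc, scale, x = 10.0, 0.0, 2.0, 0.8
y = (x - loc) / scale
Z = (
    math.log(scale)
    + 0.5 * math.log(df)
    + 0.5 * math.log(math.pi)
    + math.lgamma(0.5 * df)
    - math.lgamma(0.5 * (df + 1.0))
)
lp = -0.5 * (df + 1.0) * math.log1p(y * y / df) - Z
assert np.isclose(lp, student_t.logpdf(x, df, loc=loc, scale=scale))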
+ """ + return paddle.exp(self.log_prob(value)) diff --git a/python/paddle/incubate/nn/functional/block_multihead_attention.py b/python/paddle/incubate/nn/functional/block_multihead_attention.py index a55f61de2c678..596b9581570ad 100644 --- a/python/paddle/incubate/nn/functional/block_multihead_attention.py +++ b/python/paddle/incubate/nn/functional/block_multihead_attention.py @@ -389,3 +389,156 @@ def block_multihead_attention( }, ) return out, qkv, key_cache, value_cache + + +def block_multihead_attention_xpu( + qkv, + key_cache, + value_cache, + seq_lens_encoder, + seq_lens_decoder, + seq_lens_this_time, + padding_offsets, + cum_offsets, + cu_seqlens_q, + cu_seqlens_k, + block_tables, + cache_k_per_batch_maxs, + cache_v_per_batch_maxs, + pre_key_cache=None, + pre_value_cache=None, + cache_k_quant_scales=None, + cache_v_quant_scales=None, + cache_k_dequant_scales=None, + cache_v_dequant_scales=None, + qkv_out_scale=None, + qkv_bias=None, + out_shift=None, + out_smooth=None, + max_enc_len_this_time=None, + max_dec_len_this_time=None, + rope_emb=None, + mask=None, + tgt_mask=None, + max_seq_len=-1, + block_size=64, + use_neox_style=False, + use_dynamic_cachekv_quant=False, + quant_round_type=1, + quant_max_bound=127.0, + quant_min_bound=-127.0, + out_scale=-1, + compute_dtype="default", +): + if in_dynamic_mode(): + return _C_ops.block_multihead_attention_xpu( + qkv, + key_cache, + value_cache, + seq_lens_encoder, + seq_lens_decoder, + seq_lens_this_time, + padding_offsets, + cum_offsets, + cu_seqlens_q, + cu_seqlens_k, + block_tables, + cache_k_per_batch_maxs, + cache_v_per_batch_maxs, + pre_key_cache, + pre_value_cache, + rope_emb, + mask, + tgt_mask, + cache_k_quant_scales, + cache_v_quant_scales, + cache_k_dequant_scales, + cache_v_dequant_scales, + qkv_out_scale, + qkv_bias, + out_shift, + out_smooth, + max_enc_len_this_time, + max_dec_len_this_time, + max_seq_len, + block_size, + use_neox_style, + use_dynamic_cachekv_quant, + quant_round_type, + quant_max_bound, + quant_min_bound, + out_scale, + compute_dtype, + ) + + helper = LayerHelper('block_multihead_attention_xpu', **locals()) + out = helper.create_variable_for_type_inference(dtype=qkv.dtype) + + inputs = {} + inputs['qkv'] = qkv + inputs['key_cache'] = key_cache + inputs['value_cache'] = value_cache + inputs['seq_lens_encoder'] = seq_lens_encoder + inputs['seq_lens_decoder'] = seq_lens_decoder + inputs['seq_lens_this_time'] = seq_lens_this_time + inputs['padding_offsets'] = padding_offsets + inputs['cum_offsets'] = cum_offsets + inputs['cu_seqlens_q'] = cu_seqlens_q + inputs['cu_seqlens_k'] = cu_seqlens_k + inputs['block_tables'] = block_tables + inputs['cache_k_per_batch_maxs'] = cache_k_per_batch_maxs + inputs['cache_v_per_batch_maxs'] = cache_v_per_batch_maxs + if pre_key_cache is not None: + inputs['pre_key_cache'] = pre_key_cache + if pre_value_cache is not None: + inputs['pre_value_cache'] = pre_value_cache + if rope_emb is not None: + inputs['rope_emb'] = rope_emb + if mask is not None: + inputs['mask'] = mask + if tgt_mask is not None: + inputs['tgt_mask'] = tgt_mask + if cache_k_quant_scales is not None: + inputs["cache_k_quant_scales"] = cache_k_quant_scales + if cache_v_quant_scales is not None: + inputs["cache_v_quant_scales"] = cache_v_quant_scales + if cache_k_dequant_scales is not None: + inputs["cache_k_dequant_scales"] = cache_k_dequant_scales + if cache_v_dequant_scales is not None: + inputs["cache_v_dequant_scales"] = cache_v_dequant_scales + if qkv_out_scale is not None: + inputs["qkv_out_scale"] = 
qkv_out_scale + if qkv_bias is not None: + inputs["qkv_bias"] = qkv_bias + if out_shift is not None: + inputs["out_shift"] = out_shift + if out_smooth is not None: + inputs["out_smooth"] = out_smooth + if max_enc_len_this_time is not None: + inputs["max_enc_len_this_time"] = max_enc_len_this_time + if max_dec_len_this_time is not None: + inputs["max_dec_len_this_time"] = max_dec_len_this_time + + outputs = { + 'fmha_out': out, + 'qkv_out': qkv, + 'key_cache_out': key_cache, + 'value_cache_out': value_cache, + } + helper.append_op( + type='block_multihead_attention_xpu', + inputs=inputs, + outputs=outputs, + attrs={ + 'max_seq_len': max_seq_len, + 'block_size': block_size, + 'use_neox_style': use_neox_style, + 'dynamic_cachekv_quant': use_dynamic_cachekv_quant, + 'quant_round_type': quant_round_type, + 'quant_max_bound': quant_max_bound, + 'quant_min_bound': quant_min_bound, + 'out_scale': out_scale, + 'compute_dtype': compute_dtype, + }, + ) + return out, qkv, key_cache, value_cache diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 56a0d8a613be6..2367d5518ed92 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -337,14 +337,12 @@ def output_spec(self, spec): return if not isinstance(spec, list): raise TypeError( - "The config `output_spec` should be 'list', but received input type is %s." - % type(input) + f"The config `output_spec` should be 'list', but received input type is {type(input)}." ) for var in spec: if not isinstance(var, core.eager.Tensor): raise TypeError( - "The element in config `output_spec` list should be 'Variable', but received element's type is %s." - % type(var) + f"The element in config `output_spec` list should be 'Variable', but received element's type is {type(var)}." ) self._output_spec = spec @@ -358,8 +356,7 @@ def model_filename(self, filename): return if not isinstance(filename, str): raise TypeError( - "The config `model_filename` should be str, but received input's type is %s." - % type(filename) + f"The config `model_filename` should be str, but received input's type is {type(filename)}." ) if len(filename) == 0: raise ValueError("The config `model_filename` is empty string.") @@ -375,8 +372,7 @@ def params_filename(self, filename): return if not isinstance(filename, str): raise TypeError( - "The config `params_filename` should be str, but received input's type is %s." - % type(filename) + f"The config `params_filename` should be str, but received input's type is {type(filename)}." ) if len(filename) == 0: raise ValueError("The config `params_filename` is empty string.") @@ -392,8 +388,7 @@ def keep_name_table(self, value): return if not isinstance(value, bool): raise TypeError( - "The config `keep_name_table` should be bool value, but received input's type is %s." - % type(value) + f"The config `keep_name_table` should be bool value, but received input's type is {type(value)}." ) self._keep_name_table = value @@ -413,8 +408,7 @@ def _parse_save_configs(configs): for key in configs: if key not in supported_configs: raise ValueError( - "The additional config (%s) of `paddle.jit.save` is not supported." - % (key) + f"The additional config ({key}) of `paddle.jit.save` is not supported." ) # construct inner config @@ -439,8 +433,7 @@ def _parse_load_config(configs): for key in configs: if key not in supported_configs: raise ValueError( - "The additional config (%s) of `paddle.jit.load` is not supported." - % (key) + f"The additional config ({key}) of `paddle.jit.load` is not supported." 
) # construct inner config @@ -554,7 +547,7 @@ def _get_output_vars(outputs, output_spec, with_hook=False): output_size = len(result_list) if len(output_spec) == output_size: for var in output_spec: - if not isinstance(var, paddle.pir.Value, int): + if not isinstance(var, (paddle.pir.Value, int)): warnings.warn(output_spec_is_not_value_error % var.name) else: if var not in ValueSet(result_list): @@ -636,9 +629,9 @@ def _build_load_path_and_config(path, config): ) elif not prefix_format_exist and not directory_format_exist: raise ValueError( - "The ``path`` (%s) to load model not exists. " + f"The ``path`` ({path}) does not exist to load a model. " "Please make sure that *.pdmodel exists or " - "don't using ``skip_forward=True`` to jit.save." % path + "don't use ``skip_forward=True`` in jit.save." ) else: if prefix_format_exist: @@ -954,8 +947,7 @@ def save(layer, path, input_spec=None, **configs): isinstance(layer, (Layer, StaticFunction)) or inspect.isfunction(layer) ): raise TypeError( - "The input of paddle.jit.save should be 'Layer' or 'Function', but received input type is %s." - % type(layer) + f"The input of paddle.jit.save should be 'Layer' or 'Function', but received input type is {type(layer)}." ) elif inspect.isfunction(layer) or isinstance(layer, StaticFunction): warnings.warn( @@ -996,14 +988,12 @@ def save(layer, path, input_spec=None, **configs): and 'forward' != attr_func ): raise ValueError( - "If there are static functions other than 'forward' that need to be saved, the input 'input_spec' should be None, but received the type of 'input_spec' is %s." - % type(input_spec) + f"If there are static functions other than 'forward' that need to be saved, the input 'input_spec' should be None, but received the type of 'input_spec' is {type(input_spec)}." ) if not isinstance(input_spec, (list, tuple)): raise TypeError( - "The input input_spec should be 'list', but received input_spec's type is %s." - % type(input_spec) + f"The input input_spec should be 'list', but received input_spec's type is {type(input_spec)}." ) inner_input_spec = [] for var in paddle.utils.flatten(input_spec):
diff --git a/python/paddle/jit/dy2static/ast_utils.py b/python/paddle/jit/dy2static/ast_utils.py index fc703dd6f6e49..7c4c90ec44d0e 100644 --- a/python/paddle/jit/dy2static/ast_utils.py +++ b/python/paddle/jit/dy2static/ast_utils.py @@ -27,8 +27,7 @@ def ast_to_source_code(ast_node): """ if not isinstance(ast_node, (gast.AST, ast.AST)): raise TypeError( - "Type of ast_root should be gast.AST or ast.AST, but received %s." - % type(ast_node) + f"Type of ast_root should be gast.AST or ast.AST, but received {type(ast_node)}." ) if isinstance(ast_node, gast.AST): ast_node = gast.gast_to_ast(ast_node)
diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index 7ef8b4ce88736..10d2c9633ae80 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -615,8 +615,7 @@ def convert_len(var): return paddle.tensor.array_length(var) else: raise TypeError( - 'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received %s.' - % type(var) + f'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received {type(var)}.'
) elif isinstance(var, Value): if var.is_dense_tensor_type() or var.is_selected_row_type(): diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py index 7d5605f547df8..ce0b8382e9d01 100644 --- a/python/paddle/jit/dy2static/function_spec.py +++ b/python/paddle/jit/dy2static/function_spec.py @@ -179,7 +179,7 @@ def pir_to_static_inputs_with_spec(self, input_with_spec, main_program): if isinstance(var_spec, paddle.static.InputSpec): stop_gradient = getattr(var_spec, 'stop_gradient', False) feed_value = paddle.static.input.data( - name=var_spec.name or "feed_%s" % i, + name=var_spec.name or f"feed_{i}", shape=var_spec.shape, dtype=convert_dtype(var_spec.dtype), ) @@ -232,7 +232,7 @@ def to_static_inputs_with_spec(self, input_with_spec, main_program): stop_gradient = getattr(var_spec, 'stop_gradient', False) feed_layer = block.create_var( # TODO(Aurelius84): consider a more elegant way to name this - name=var_spec.name or "feed_%s" % i, + name=var_spec.name or f"feed_{i}", shape=var_spec.shape, dtype=var_spec.dtype, is_data=True, diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 8571740db2659..f4fc6ea387f97 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -1108,8 +1108,7 @@ def _check_params_all_inited(self, main_program): """ if not isinstance(self._params, (list, tuple)): raise TypeError( - "Type of self._params in PartialProgramLayer should be list or tuple, but received %s." - % type(self._params) + f"Type of self._params in PartialProgramLayer should be list or tuple, but received {type(self._params)}." ) param_and_buffer_names_set = set() @@ -1127,12 +1126,11 @@ def _check_params_all_inited(self, main_program): if name not in param_and_buffer_names_set: raise ValueError( "\n\tWe don't support to define layer with parameters in the function decorated by `@to_static`." - "\n\tBut we found parameter(%s) was created in the decorated function." + f"\n\tBut we found parameter({name}) was created in the decorated function." "\n" "\n\tRevise suggestion: " "\n\t\t1. Please ensure all your sublayers are inherited from nn.Layer." "\n\t\t2. Please use nn.ParameterList and nn.LayerList as container instead of using a native Python container such as List" - % name ) def _valid_vars(self, vars): diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py index 55d8ab47e92a4..ff6ee46c8a1f9 100644 --- a/python/paddle/jit/dy2static/pir_partial_program.py +++ b/python/paddle/jit/dy2static/pir_partial_program.py @@ -1257,8 +1257,7 @@ def _check_params_all_inited(self, main_program): """ if not isinstance(self._params, (list, tuple)): raise TypeError( - "Type of self._params in PartialProgramLayer should be list or tuple, but received %s." - % type(self._params) + f"Type of self._params in PartialProgramLayer should be list or tuple, but received {type(self._params)}." 
) param_and_buffer_names_set = set() diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index ea4040485b64a..d1a85626c17fc 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -1597,8 +1597,7 @@ def _build_once(self, cache_key): def __getitem__(self, item): if not isinstance(item, CacheKey): raise ValueError( - 'type(item) should be CacheKey, but received %s' - % type_name(item) + f'type(item) should be CacheKey, but received {type_name(item)}' ) item_id = hash(item) self._recent_cache_key = item @@ -1621,8 +1620,7 @@ def get_program_without_cache(self, cache_key): def get_program(self, item): if not isinstance(item, CacheKey): raise ValueError( - "Input item's type should be FunctionSpec, but received %s" - % type_name(item) + f"Input item's type should be FunctionSpec, but received {type_name(item)}" ) item_id = hash(item) if item_id not in self._caches: diff --git a/python/paddle/jit/dy2static/transformers/early_return_transformer.py b/python/paddle/jit/dy2static/transformers/early_return_transformer.py index 4dab1e5ab1638..ce8cf9e606878 100644 --- a/python/paddle/jit/dy2static/transformers/early_return_transformer.py +++ b/python/paddle/jit/dy2static/transformers/early_return_transformer.py @@ -36,9 +36,7 @@ def transform(self): def is_define_return_in_if(self, node): assert isinstance( node, gast.If - ), "Type of input node should be gast.If, but received %s ." % type( - node - ) + ), f"Type of input node should be gast.If, but received {type(node)}." for child in node.body: if isinstance(child, gast.Return): return True diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index ad195befba4b5..03a2cd06d3211 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -204,7 +204,7 @@ def make_hashable(x, error_msg=None): return tuple(map(make_hashable, x.values())) error_msg = error_msg or "Requires a hashable object." - raise ValueError(error_msg + " But received type: %s" % type_name(x)) + raise ValueError(f"{error_msg} But received type: {type_name(x)}") return x @@ -327,8 +327,7 @@ def func_prefix(func): callable_func = getattr(module, func_name) else: raise ValueError( - 'Function: %s doesn\'t exist in the Module transformed from AST.' - % func_name + f'Function: {func_name} doesn\'t exist in the Module transformed from AST.' ) # After transform dygraph function into callable_func saved in tmp file, # it lost the global variables from imported statements or defined in source file. diff --git a/python/paddle/jit/pir_translated_layer.py b/python/paddle/jit/pir_translated_layer.py index 8a6e3ede35e2a..6bdf8f2952d8d 100644 --- a/python/paddle/jit/pir_translated_layer.py +++ b/python/paddle/jit/pir_translated_layer.py @@ -217,7 +217,6 @@ def _load_pir_parameter_vars(model_path, program_holder, params_filename): # load all vars assert params_filename is not None, "params_filename should not be None." var_file_path = os.path.join(model_path, params_filename) - if os.path.exists(var_file_path): core.load_combine_func( var_file_path, @@ -228,8 +227,7 @@ def _load_pir_parameter_vars(model_path, program_holder, params_filename): ) else: raise ValueError( - "The file %s does not exist. Please check the model path." - % var_file_path + f"The file {var_file_path} does not exist. Please check the model path." 
) load_var_dict.update(other_var_dict) @@ -328,8 +326,7 @@ def _run_dygraph(instance, input, program_holder): for i, value in enumerate(input): if not isinstance(value, (np.ndarray, core.eager.Tensor)): raise TypeError( - "The type of input in PirTranslatedLayer must be numpy array or Variable(Tensor), but received %s." - % type(value) + f"The type of input in PirTranslatedLayer must be numpy array or Variable(Tensor), but received {type(value)}." ) # NOTE: In order to unify the API, firstly convert the input to Tensor if isinstance(value, np.ndarray): @@ -361,8 +358,7 @@ def _run_dygraph(instance, input, program_holder): persistable_tensors.append(instance._buffers[dy_var_name]) else: raise ValueError( - "The persistable variable %s does not exist in current PirTranslatedLayer." - % var_name + f"The persistable variable {var_name} does not exist in current PirTranslatedLayer." ) from paddle.jit.dy2static.pir_partial_program import PartialProgramLayer @@ -378,7 +374,6 @@ def _run_dygraph(instance, input, program_holder): parameters, ) instance.layer = layer - if instance._is_test: layer.training = False else: @@ -392,9 +387,42 @@ def _run_dygraph(instance, input, program_holder): return instance.layer(input_tensors) -def _run_static_graph(program_holder, trace_program): - paddle.base.framework.switch_main_program(trace_program) - return program_holder.output_vars +def _run_static_graph(inputs, program_holder, src_program): + ''' + This function is used when the PirTranslatedLayer is + applied in dy_to_static conversion. + ''' + dst_program = paddle.static.default_main_program() + value_map = paddle.pir.IrMapping() + # Establish a mapping relationship between existing parameters + # and corresponding parameters in the program to be copied + len_dst_op = len(dst_program.global_block().ops) + for dst_op in dst_program.global_block().ops: + if dst_op.name() == "builtin.parameter": + for src_op in src_program.global_block().ops[:len_dst_op]: + if ( + src_op.name() == dst_op.name() + and src_op.result(0).name == dst_op.result(0).name + ): + for i in range(src_op.num_results()): + value_map.add(src_op.result(i), dst_op.result(i)) + # Establish a mapping relationship between the real inputs + # and the corresponding inputs in the program to be copied + src_inputs = program_holder.input_vars + if len(src_inputs) != len(inputs): + raise ValueError( + f"The number of inputs is invalid, expected {len(src_inputs)}, but received {len(inputs)}." + ) + for src_input, input_ in zip(src_inputs, inputs): + value_map.add(src_input, input_) + + # find the insert point for copy + current_insert_point = paddle.pir.get_current_insertion_point() + current_block = current_insert_point.block() + src_program.copy_to_block(value_map, current_block) + + output = [value_map.look_up(v) for v in program_holder.output_vars] + return output[0] if len(output) == 1 else output def _collect_current_and_parent_var(program, block_idx): @@ -561,7 +589,7 @@ def _construct(model_path, configs=None): # 0.
dir and filename check model_path = os.path.normpath(model_path) if not os.path.isdir(model_path): - raise ValueError("There is no directory named '%s'" % model_path) + raise ValueError(f"There is no directory named '{model_path}'") model_filename = None params_filename = None if configs is not None: @@ -608,7 +636,7 @@ def __i_m_p_l__(self, *input): return _run_dygraph(self, input, program_holder) else: return _run_static_graph( - program_holder, program_holder.infer_program + input, program_holder, program_holder.infer_program ) __i_m_p_l__.__name__ = method_name @@ -719,8 +747,7 @@ def _get_program_holder(self, method_name='forward'): program_holder = self._program_holder_dict.get(method_name, None) if program_holder is None: raise ValueError( - "The method `%s` does not exist in loaded PirTranslatedLayer." - % method_name + f"The method `{method_name}` does not exist in loaded PirTranslatedLayer." ) return program_holder diff --git a/python/paddle/jit/sot/infer_meta.py b/python/paddle/jit/sot/infer_meta.py index 3ec9f0d891c9e..a67b10c27105f 100644 --- a/python/paddle/jit/sot/infer_meta.py +++ b/python/paddle/jit/sot/infer_meta.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations from functools import cached_property +from typing import TypeVar import paddle from paddle.amp.auto_cast import amp_state @@ -26,10 +28,32 @@ from .utils import Cache, Singleton, map_if_extend, meta_str +DynamicSymbolT = TypeVar("DynamicSymbolT") + + +class SymbolicInt(metaclass=Singleton): + def __eq__(self, other) -> bool: + return isinstance(other, (int, SymbolicInt)) + + def __repr__(self) -> str: + return "SymbolicInt()" + + def __str__(self) -> str: + return "SymbolicInt()" + class MetaInfo: def __init__( - self, shape, dtype, stop_gradient, name, persistable, type, place + self, + shape, + dtype, + stop_gradient, + name, + persistable, + type, + place, + *, + dynamic_axes: list[int] | None = None, ): self.name = name self.persistable = persistable @@ -38,9 +62,18 @@ def __init__( self.shape = shape self.dtype = dtype self.stop_gradient = stop_gradient + self.dynamic_axes = dynamic_axes or [] + + def get_dynamic_shape( + self, dynamic_symbol: DynamicSymbolT = -1 + ) -> list[int | DynamicSymbolT]: + return [ + dim if i not in self.dynamic_axes else dynamic_symbol + for i, dim in enumerate(self.shape) + ] @staticmethod - def from_tensor(tensor): + def from_tensor(tensor, *, dynamic_axes: list[int] | None = None): if isinstance(tensor, paddle.pir.Value): name = "Value@NoName" else: # For Tensor or Variable @@ -54,6 +87,7 @@ def from_tensor(tensor): ) assert isinstance(dtype, expected_dtype_class) + # TODO(@xiongkun) remove after pir become default state. # We always use float32 in simulation if AMP is enabled. current_amp_state = amp_state() if ( @@ -63,7 +97,12 @@ def from_tensor(tensor): and current_amp_state["dtype"] == "float16" ): dtype = paddle.float32 - # TODO(@xiongkun) remove after pir become default state. 
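Editorial note, not part of this patch: `SymbolicInt` above is the sentinel that dynamic dims are rendered as inside guard strings (see `guard_str` below and `make_stringify_guard` in variables/basic.py further down), so a recorded guard keeps matching when only the dynamic dims change; its `__eq__` also accepts any plain int, so structural shape comparisons still pass. A standalone miniature of the idea:

class FakeSymbolicInt:  # stand-in for SymbolicInt, illustration only
    def __eq__(self, other):
        return isinstance(other, (int, FakeSymbolicInt))

    def __repr__(self):
        return "SymbolicInt()"

assert FakeSymbolicInt() == 3 and FakeSymbolicInt() == 4096
print([FakeSymbolicInt(), 128])  # [SymbolicInt(), 128] -- the form embedded in guard strings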
+ dynamic_axes = dynamic_axes or [] + dynamic_axes = [ + i + for i, dim in enumerate(tensor.shape) + if dim == -1 or i in dynamic_axes + ] return MetaInfo( list(tensor.shape), dtype, @@ -72,6 +111,7 @@ def from_tensor(tensor): persistable, tensor.type, tensor.place, + dynamic_axes=dynamic_axes, ) def is_dynamic_shape(self): @@ -82,12 +122,14 @@ def is_dynamic_shape(self): return -1 in self.shape def to_input_spec(self): + shape = self.get_dynamic_shape(None) return paddle.static.InputSpec( - self.shape, dtype=self.dtype, stop_gradient=self.stop_gradient + shape, dtype=self.dtype, stop_gradient=self.stop_gradient ) def guard_str(self): - return f"({self.shape}, {self.dtype}, {self.stop_gradient})" + shape = self.get_dynamic_shape(SymbolicInt()) + return f"({shape}, {self.dtype}, {self.stop_gradient})" def __repr__(self): return meta_str(self.shape, self.dtype, self.stop_gradient) @@ -161,20 +203,22 @@ def startup_program(self): else: return self.legacy_programs[1] - def create_var(self, meta): + def create_var(self, meta: MetaInfo): + shape = meta.get_dynamic_shape() + if paddle.framework.use_pir_api(): with paddle.static.program_guard( self.main_program, self.startup_program ): var = paddle.static.input.data( name=self.gen_name(meta), - shape=meta.shape, + shape=shape, dtype=convert_dtype(meta.dtype), ) var.stop_gradient = meta.stop_gradient else: var = self.main_program.global_block().create_var( - shape=meta.shape, + shape=shape, dtype=meta.dtype, stop_gradient=meta.stop_gradient, ) @@ -193,9 +237,10 @@ def infer_meta(self, func, *args, **kwargs): with paddle.base.framework._dygraph_guard(None), UniqueNameGuard( self.var_name_generator ): - args, kwargs = convert_meta_to_variable( - args - ), convert_meta_to_variable(kwargs) + args, kwargs = ( + convert_meta_to_variable(args), + convert_meta_to_variable(kwargs), + ) with paddle.static.program_guard( self.main_program, self.startup_program @@ -225,9 +270,11 @@ def convert_meta_to_input_spec(args): pred=lambda x: isinstance(x, MetaInfo), true_fn=lambda x: x.to_input_spec(), # TODO(xiongkun): can x be tensor ? 
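Editorial note, not part of this patch: `to_input_spec` above passes the dynamic axes to `paddle.static.InputSpec` as `None`, so only the static dims get pinned. The substitution `get_dynamic_shape` performs, shown in isolation:

shape, dynamic_axes = [32, 128, 768], [0]  # axis 0 (batch) marked dynamic
spec_shape = [None if i in dynamic_axes else d for i, d in enumerate(shape)]
print(spec_shape)  # [None, 128, 768]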
- false_fn=lambda x: paddle.static.InputSpec.from_tensor(x) - if isinstance(x, paddle.Tensor) - else x, + false_fn=lambda x: ( + paddle.static.InputSpec.from_tensor(x) + if isinstance(x, paddle.Tensor) + else x + ), ) diff --git a/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py b/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py index f94884d0c118b..bbefddda639ad 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py +++ b/python/paddle/jit/sot/opcode_translator/executor/executor_cache.py @@ -56,12 +56,16 @@ class OpcodeExecutorCache(metaclass=Singleton): MAX_CACHE_SIZE = 20 cache: dict[types.CodeType, GuardedFunctions] translate_count: int - symbolic_inputs: dict[str, dict[int, int]] + code_symbolic_inputs: dict[types.CodeType, dict[str, dict[int, int]]] def __init__(self): self.cache = {} self.translate_count = 0 - self.symbolic_inputs = {} + self.code_symbolic_inputs = {} + + def get_symbolic_inputs(self, code: types.CodeType): + self.code_symbolic_inputs.setdefault(code, {}) + return self.code_symbolic_inputs[code] def clear(self): """ @@ -69,6 +73,7 @@ def clear(self): """ self.cache.clear() self.translate_count = 0 + self.code_symbolic_inputs.clear() def __call__(self, frame: types.FrameType, **kwargs) -> CustomCode: code: types.CodeType = frame.f_code diff --git a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py index 99ea75ebbcd48..93de3c8dfe815 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py +++ b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py @@ -22,9 +22,9 @@ from collections import namedtuple from copy import deepcopy from functools import cached_property -from typing import Any, Callable +from typing import Any, Callable, Tuple, Union -from typing_extensions import TypeGuard +from typing_extensions import TypeAlias, TypeGuard import paddle from paddle.jit.utils import OrderedSet @@ -37,7 +37,7 @@ ast_infer_meta, ) from ...profiler import EventGuard, event_register -from ...symbolic.statement_ir import Reference, Symbol +from ...symbolic.statement_ir import Reference, StatementIR, Symbol from ...symbolic.symbolic_context import SymbolicTraceContext from ...utils import ( NameGenerator, @@ -81,6 +81,15 @@ map_variables, ) +CompileGraphResult: TypeAlias = Tuple[ + Callable[..., Any], + Tuple[ + StatementIR, + OrderedSet[Union[TensorVariable, SymbolicVariable]], + OrderedSet[Union[TensorVariable, SymbolicVariable]], + ], +] + def convert_to_meta(inputs: Any): """ @@ -329,7 +338,7 @@ def _restore_origin_opcode(self, stack_vars, store_var_info, instr_idx): self.pycode_gen.gen_enable_eval_frame() - name_gen = NameGenerator("__start_compile_saved_orig_") + name_gen = NameGenerator("___compile_fn_saved_orig_") # here is not update changed values, it just give names to stack vars # and want keep same interface as _build_compile_fn_with_name_store @@ -344,13 +353,18 @@ def _restore_origin_opcode(self, stack_vars, store_var_info, instr_idx): return VariableLoader(store_var_info, self.pycode_gen) - def _build_compile_fn_with_name_store(self, to_store_vars, store_var_info): + def _build_compile_fn_with_name_store( + self, + compile_graph_result: CompileGraphResult, + to_store_vars, + store_var_info, + ): # var_id -> local_name mapping to_store_vars = list( filter(lambda x: not isinstance(x, NullVariable), to_store_vars) ) - self.start_compile(*to_store_vars) - name_gen = 
NameGenerator("__start_compile_saved_") + self.compile_function(compile_graph_result, to_store_vars) + name_gen = NameGenerator("___compile_fn_saved_") for var in to_store_vars[::-1]: if store_var_info[var.id] is None: @@ -363,23 +377,38 @@ def _build_compile_fn_with_name_store(self, to_store_vars, store_var_info): return VariableLoader(store_var_info, self.pycode_gen) - def get_compiled_fn(self, *ret_vars): + def compile_graph(self, *ret_vars: VariableBase) -> CompileGraphResult: ret_items = [ ret_item for ret_var in ret_vars for ret_item in ret_var.flatten_items() ] - tensor_items = self._find_tensor_outputs(ret_items) - compiled_fn, _ = self.sir_ctx.compile_fn( - [Symbol(tensor_var.var_name) for tensor_var in tensor_items], + symbolic_outputs = self._find_tensor_outputs(ret_items) + statement_ir = self.sir_ctx.return_TOS( + [Symbol(tensor_var.var_name) for tensor_var in symbolic_outputs] + ) + if not statement_ir.statements: + return self.sir_ctx.compile_do_nothing(), ( + statement_ir, + OrderedSet(), + OrderedSet(), + ) + input_names = statement_ir.inputs + symbolic_inputs = self._find_tensor_inputs(input_names) + compiled_fn = self.sir_ctx.compile_fn( + statement_ir.name, + [var.meta.to_input_spec() for var in symbolic_inputs], **self._kwargs, ) + return compiled_fn, (statement_ir, symbolic_inputs, symbolic_outputs) - return compiled_fn - - @event_register("start_compile", event_level=2) - def start_compile(self, *ret_vars: VariableBase): + @event_register("compile_function", event_level=2) + def compile_function( + self, + compile_graph_result: CompileGraphResult, + ret_vars: list[VariableBase], + ): """ Generate bytecode based on the information collected by the simulation execution. @@ -393,48 +422,24 @@ def start_compile(self, *ret_vars: VariableBase): """ from ..breakpoint import BreakpointManager - BreakpointManager().on_event("start_compile") - - ret_items = [ - ret_item - for ret_var in ret_vars - for ret_item in ret_var.flatten_items() - ] - - tensor_items = self._find_tensor_outputs(ret_items) - compiled_fn, statement_ir = self.sir_ctx.compile_fn( - [Symbol(tensor_var.var_name) for tensor_var in tensor_items], - **self._kwargs, - ) - input_names = statement_ir.inputs - compiled_fn_name = f"__compiled_fn_{statement_ir.name}" + BreakpointManager().on_event("compile_function") + graph_fn, ( + statement_ir, + symbolic_inputs, + symbolic_outputs, + ) = compile_graph_result + compiled_fn_name = f"___graph_fn_{statement_ir.name}" # prepare function and inputs - self.pycode_gen.gen_load_object(compiled_fn, compiled_fn_name) - for name in input_names: - found = False - for variable in self.input_variables: - if ( - isinstance(variable, (TensorVariable, SymbolicVariable)) - and variable.get_symbol().name == name - ): - if isinstance(variable, SymbolicVariable): - self.pycode_gen.gen_load_object( - paddle.to_tensor, "___paddle_to_tensor" - ) - variable.tracker.gen_instructions(self.pycode_gen) - found = True - if isinstance(variable, SymbolicVariable): - self.pycode_gen.gen_call_function(1) - break - assert found, f"can't find input {name} in SIR." + self.pycode_gen.gen_load_object(graph_fn, compiled_fn_name) + self.gen_load_inputs(symbolic_inputs) # Pack all args into a tuple, because we don't support *args now. 
- self.pycode_gen.gen_build_tuple(count=len(input_names)) - # call the compiled_fn + self.pycode_gen.gen_build_tuple(count=len(symbolic_inputs)) + # call the graph_fn self.pycode_gen.gen_call_function(argc=1) # Store outputs to f_locals - self.pycode_gen.gen_unpack_sequence(count=len(tensor_items)) - for tensor_var in tensor_items: + self.pycode_gen.gen_unpack_sequence(count=len(symbolic_outputs)) + for tensor_var in symbolic_outputs: self.pycode_gen.gen_store_fast(tensor_var.out_var_name) # restore the outputs. for ret_var in ret_vars: @@ -725,6 +730,36 @@ def remove_global_guarded_variable(self, variable: VariableBase): if variable in self._global_guarded_variables: self._global_guarded_variables.remove(variable) + def _find_tensor_inputs( + self, input_names: list[str] + ) -> OrderedSet[TensorVariable | SymbolicVariable]: + inputs: OrderedSet[TensorVariable | SymbolicVariable] = OrderedSet() + for name in input_names: + found = False + for variable in self.input_variables: + if ( + isinstance(variable, (TensorVariable, SymbolicVariable)) + and variable.get_symbol().name == name + ): + inputs.add(variable) + found = True + break + assert found, f"can't find input {name} in SIR." + assert len(inputs) == len(input_names), "Number of inputs not match." + return inputs + + def gen_load_inputs( + self, inputs: OrderedSet[TensorVariable | SymbolicVariable] + ): + for input_var in inputs: + if isinstance(input_var, SymbolicVariable): + self.pycode_gen.gen_load_object( + paddle.to_tensor, "___paddle_to_tensor" + ) + input_var.tracker.gen_instructions(self.pycode_gen) + if isinstance(input_var, SymbolicVariable): + self.pycode_gen.gen_call_function(1) + def _find_tensor_outputs( self, outputs: list[VariableBase] ) -> OrderedSet[TensorVariable | SymbolicVariable]: @@ -738,12 +773,14 @@ def _find_tensor_outputs( def is_graph_output( var, ) -> TypeGuard[TensorVariable | SymbolicVariable]: - return isinstance(var.tracker, DummyTracker) and isinstance( - var, (TensorVariable, SymbolicVariable) - ) + return isinstance( + var.tracker, (DummyTracker, SymbolicOperationTracker) + ) and isinstance(var, (TensorVariable, SymbolicVariable)) def collect_related_dummy_tensor(var): - if isinstance(var.tracker, DummyTracker): + if isinstance( + var.tracker, (DummyTracker, SymbolicOperationTracker) + ): if is_graph_output(var): return [var] else: @@ -758,7 +795,9 @@ def collect_related_dummy_tensor(var): ] = OrderedSet() # Find Tensor Variables from outputs. 
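Editorial note, not part of this patch: the loop below must also materialize tensors that are reachable only through non-tensor outputs (for example a tensor sitting inside a returned list). The recursion, shrunk to a runnable toy with stand-in classes:

class Var:
    def __init__(self, is_tensor, inputs=()):
        self.is_tensor, self.inputs = is_tensor, inputs

def collect_graph_outputs(var):
    if var.is_tensor:  # a concrete graph output
        return [var]
    found = []
    for inp in var.inputs:  # e.g. a container variable wrapping tensors
        found += collect_graph_outputs(inp)
    return found

t = Var(True)
assert collect_graph_outputs(Var(False, inputs=(t,))) == [t]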
for output in outputs: - if isinstance(output.tracker, DummyTracker): + if isinstance( + output.tracker, (DummyTracker, SymbolicOperationTracker) + ): if is_graph_output(output): output_tensors.add(output) else: @@ -809,7 +848,7 @@ def restore_print_stmts(self, variables: list[VariableBase]): add_to_global_guarded_vars=False, ) - def restore_inplace_tensor(self, variables: list[VariableBase]): + def restore_inplace_tensor(self, variables: OrderedSet[VariableBase]): for var in variables: if not var.tracker.is_traceable(): continue diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index 70870913a6a02..3146609a595b0 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -1737,11 +1737,12 @@ def RETURN_CONST(self, instr: Instruction): return self.compile_return(ret_const) def compile_return(self, ret_val): - compile_fn = self._graph.get_compiled_fn(ret_val) - if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): + compile_graph_result = self._graph.compile_graph(ret_val) + graph_fn, _ = compile_graph_result + if graph_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): self.new_code = None else: - self._graph.start_compile(ret_val) + self._graph.compile_function(compile_graph_result, [ret_val]) self._graph.pycode_gen.gen_return() self.new_code = self._graph.pycode_gen.gen_pycode() self.guard_fn = self._graph.guard_fn @@ -1775,15 +1776,16 @@ def get_compute_fn_and_update_changed_vars( store_vars.append(_var) store_var_info[_var.id] = name - compile_fn = self._graph.get_compiled_fn(*store_vars) + compile_graph_result = self._graph.compile_graph(*store_vars) + graph_fn, _ = compile_graph_result - if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): + if graph_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): return self._graph._restore_origin_opcode( list(stack), store_var_info, end_idx ) else: return self._graph._build_compile_fn_with_name_store( - store_vars, store_var_info + compile_graph_result, store_vars, store_var_info ) @fallback_when_occur_error diff --git a/python/paddle/jit/sot/opcode_translator/executor/tracker.py b/python/paddle/jit/sot/opcode_translator/executor/tracker.py index 41ce17dba7cbc..85a7f68f6847a 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/tracker.py +++ b/python/paddle/jit/sot/opcode_translator/executor/tracker.py @@ -127,7 +127,7 @@ def need_guard(self) -> bool: return False -class SymbolicOperationTracker(DummyTracker): +class SymbolicOperationTracker(Tracker): """ SymbolicOperationTracker is a subclass of Tracker that specifically tracks variables cannot be reproduced from the frame. It is mostly generated by complex operations of symbolic variables. 
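# [Sketch, not part of this patch] The tracker hierarchy after this change:
#
#     Tracker
#     |-- DummyTracker               # produced by traced tensor operations
#     `-- SymbolicOperationTracker   # produced by symbolic-int operations
#
# Because SymbolicOperationTracker no longer derives from DummyTracker,
# isinstance(var.tracker, DummyTracker) stops matching symbolic operations,
# which is why the call sites above now check for both tracker types.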
@@ -151,6 +151,14 @@ def trace_value_from_frame(self): def __repr__(self) -> str: return f"SymbolicOperationTracker(num_inputs={len(self.inputs)})" + def is_traceable(self): + # TODO(zrr1999): to implement gen_instructions and trace_value_from_frame + return False + + def need_guard(self) -> bool: + # TODO(zrr1999): to implement gen_instructions and trace_value_from_frame + return False + class DanglingTracker(Tracker): """ diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py index 965b7edba28ed..ffec4b1485cb6 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py +++ b/python/paddle/jit/sot/opcode_translator/executor/variables/basic.py @@ -90,6 +90,8 @@ core.DataType.BOOL: "bool", } +STATIC_DIM_FREQ_THRESHOLD = 5 + class ConstantVariable(VariableBase): """ @@ -174,24 +176,6 @@ def chr(self): DummyTracker([self]), ) - @check_guard - def make_stringify_guard(self) -> list[StringifyExpression]: - if ( - ENV_SOT_ALLOW_DYNAMIC_SHAPE.get() - and isinstance(self.value, int) - and self.tracker.need_guard() - ): - from ..executor_cache import OpcodeExecutorCache - - frame_value_tracer = self.tracker.trace_value_from_frame() - symbolic_inputs = OpcodeExecutorCache().symbolic_inputs - symbolic_inputs.setdefault(frame_value_tracer.inlined_expr, {}) - symbolic_input = symbolic_inputs[frame_value_tracer.inlined_expr] - symbolic_input.setdefault(self.value, 0) - symbolic_input[self.value] += 1 - - return super().make_stringify_guard() - @VariableFactory.register_from_value() def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): if type(value) in ConstTypes: @@ -349,10 +333,24 @@ def __init__( raise InnerError( f"Required type(tensor) is paddle.Tensor or ProxyTensor, but received {type(tensor).__name__}." ) + dynamic_axes: list[int] = [] + if ENV_SOT_ALLOW_DYNAMIC_SHAPE.get() and self.tracker.is_traceable(): + dynamic_axes = self.analyse_dynamic_axes() + self.meta.dynamic_axes = dynamic_axes self.origin_meta = self.meta self.var_name = TensorVariable.var_name_generator.next() self.graph.side_effects.record_mutable_variable(self) + def analyse_dynamic_axes(self): + shape_dims = ( + self.shape.proxy.get_all() + ) # Trigger convert all shape dims to Variable + return [ + i + for i, dim in enumerate(shape_dims) + if isinstance(dim, SymbolicVariable) + ] + def __len__(self): if self.meta.shape[0] == -1: raise BreakGraphError( @@ -399,9 +397,13 @@ def _reconstruct(self, codegen: PyCodeGen): def make_stringify_guard(self) -> list[StringifyExpression]: frame_value_tracer = self.tracker.trace_value_from_frame() + if ENV_SOT_ALLOW_DYNAMIC_SHAPE.get(): + str_left_expr = f"MetaInfo.from_tensor({{}}, dynamic_axes={self.meta.dynamic_axes}).guard_str()" + else: + str_left_expr = "MetaInfo.from_tensor({}).guard_str()" return [ StringifyExpression( - f"MetaInfo.from_tensor({{}}).guard_str() == '{self.origin_meta.guard_str()}'", + f"{str_left_expr} == '{self.origin_meta.guard_str()}'", [frame_value_tracer], union_free_vars( {"MetaInfo": MetaInfo}, @@ -483,15 +485,15 @@ def size(self): @tensor_property def shape(self): + # TODO(zrr1999): support more tensor properties if self.meta.is_dynamic_shape(): raise BreakGraphError( f"Getting shape for a dynamic shape tensor causes graph break. 
shape = {self.meta.shape}" ) from .container import ListVariable - return ListVariable( - self.meta.shape, self.graph, tracker=DummyTracker([self]) - ) + tracker = GetAttrTracker(self, "shape") + return ListVariable(self.meta.shape, self.graph, tracker=tracker) def numel(self): return self.size @@ -605,7 +607,7 @@ class SymbolicVariable(VariableBase): def __init__( self, - value: int | MetaInfo, + value: int | None | MetaInfo, graph: FunctionGraph, tracker: Tracker, ): @@ -663,7 +665,9 @@ def make_stringify_guard(self) -> list[StringifyExpression]: from ..executor_cache import OpcodeExecutorCache frame_value_tracer = self.tracker.trace_value_from_frame() - symbolic_inputs = OpcodeExecutorCache().symbolic_inputs + symbolic_inputs = OpcodeExecutorCache().get_symbolic_inputs( + self.graph.pycode_gen._origin_code + ) assert frame_value_tracer.inlined_expr in symbolic_inputs @@ -681,25 +685,42 @@ def make_stringify_guard(self) -> list[StringifyExpression]: ) ] + @staticmethod + def should_create_symbolic_variable( + value: Any, tracker: Tracker, symbolic_inputs: dict[str, dict[int, int]] + ): + tracker_expr = tracker.trace_value_from_frame().inlined_expr + symbolic_inputs.setdefault(tracker_expr, {}) + for expr, symbolic_input in symbolic_inputs.items(): + if tracker.match_expr(expr): + symbolic_input.setdefault(value, 0) + symbolic_input[value] += 1 + if symbolic_input[value] >= STATIC_DIM_FREQ_THRESHOLD: + return False + if len(symbolic_input.keys()) > 1: + return True + return False + return False + @VariableFactory.register_from_value(successor="ConstantVariable") def from_value(value: Any, graph: FunctionGraph, tracker: Tracker): if not ENV_SOT_ALLOW_DYNAMIC_SHAPE.get(): - return + return None if not isinstance(value, int): - return - if not tracker.need_guard(): - return + return None + if not tracker.is_traceable(): + return None from ..executor_cache import OpcodeExecutorCache - symbolic_inputs = OpcodeExecutorCache().symbolic_inputs + symbolic_inputs = OpcodeExecutorCache().get_symbolic_inputs( + graph.pycode_gen._origin_code + ) - for tracker_expr, symbolic_input in symbolic_inputs.items(): - if tracker.match_expr(tracker_expr): - symbolic_input.setdefault(value, 0) - symbolic_input[value] += 1 - # TODO(zrr1999): determine frequency - return SymbolicVariable(value, graph, tracker) + if SymbolicVariable.should_create_symbolic_variable( + value, tracker, symbolic_inputs + ): + return SymbolicVariable(value, graph, tracker) return None diff --git a/python/paddle/jit/sot/symbolic/compile_cache.py b/python/paddle/jit/sot/symbolic/compile_cache.py index b697e721532f9..5cb06059bb3db 100644 --- a/python/paddle/jit/sot/symbolic/compile_cache.py +++ b/python/paddle/jit/sot/symbolic/compile_cache.py @@ -21,6 +21,7 @@ from paddle.amp.auto_cast import amp_state from paddle.base.data_feeder import convert_dtype from paddle.framework import _dygraph_tracer, use_pir_api +from paddle.static import InputSpec from ..infer_meta import convert_meta_to_input_spec from ..profiler import EventGuard @@ -162,7 +163,13 @@ class CompileSIRCache(Cache, metaclass=Singleton): def __init__(self): super().__init__(weak=False) - def key_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs): + def key_fn( + self, + context: SymbolicTraceContext, + sir_name: str, + input_spec: list[InputSpec], + **kwargs, + ): """ generate a hash key for a SIR @@ -176,10 +183,16 @@ def key_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs): """ sir = context.get_sir(sir_name) # NOTE(dev): Is str(sir) a heavy 
operation ?
-        hash_key = hash((str(sir), kwargs['training']))
+        hash_key = hash((str(sir), *input_spec, kwargs['training']))
         return hash_key

-    def value_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs):
+    def value_fn(
+        self,
+        context: SymbolicTraceContext,
+        sir_name: str,
+        input_spec: list[InputSpec],
+        **kwargs,
+    ):
         """
         Generate static graph function

@@ -196,6 +209,7 @@ def value_fn(self, context: SymbolicTraceContext, sir_name: str, **kwargs):
         return FallbackWrapper(
             paddle.jit.to_static(
                 compile_sir(context, sir_name),
+                input_spec=[input_spec],
                 build_strategy=build_strategy,
                 backend=backend,
                 full_graph=True,
diff --git a/python/paddle/jit/sot/symbolic/symbolic_context.py b/python/paddle/jit/sot/symbolic/symbolic_context.py
index cc6487f696d0a..4efe3038c2781 100644
--- a/python/paddle/jit/sot/symbolic/symbolic_context.py
+++ b/python/paddle/jit/sot/symbolic/symbolic_context.py
@@ -14,6 +14,10 @@

 from __future__ import annotations

+from typing import Any, Callable
+
+from paddle.static import InputSpec
+
 from ..utils import log
 from .compile_cache import CompileSIRCache
 from .statement_ir import (
@@ -126,7 +130,15 @@ def replace_TOS(self, sir):
         self.sir_stack.append(sir)
         self.statement_factory.update(sir)

-    def compile_do_nothing(self, ret_vals):
+    def return_TOS(self, ret_vals):
+        cur_sir: StatementIR = self.TOS
+        cur_sir.inputs = cur_sir.analyse_inputs()
+        cur_sir.outputs = ret_vals
+        log(2, "start subgraph compile and execution.\n")
+        log(2, self.TOS, "\n")
+        return cur_sir
+
+    def compile_do_nothing(self) -> Callable[..., Any]:
         """
         Return a dummy function, which will return an empty list.

@@ -141,29 +153,12 @@ def __call__(*args, **kwargs):
             def graph_size(self):
                 return 0

-        # return None function
-        dummy_stmt_ir = StatementIR("dummy_func")
-        dummy_stmt_ir.outputs = []
-        dummy_stmt_ir.inputs = []
-        return DummyFunc(), dummy_stmt_ir
+        return DummyFunc()

-    def compile_fn(self, ret_vals, **kwargs):
+    def compile_fn(self, sir_name: str, input_spec: list[InputSpec], **kwargs):
         """
         start compile and return the python function, which must can be
         to_static without errors.
         """
-        cur_sir: StatementIR = self.TOS
-        # step0: if no statement, return a dummy function
-        if len(cur_sir.statements) == 0:
-            return self.compile_do_nothing(ret_vals)
-        # step1: analyse sir inputs and outputs
-        cur_sir.inputs = cur_sir.analyse_inputs()
-        # TODO: output analysis
-        cur_sir.outputs = ret_vals
-        log(2, "start subgraph compile and execution.\n")
-        log(2, self.TOS, "\n")
-        # step2: call compile_sir and get python function, third cache is triggered here.
-        static_func = CompileSIRCache()(self, cur_sir.name, **kwargs)
-        # step3: GC and reset TOS
-        # self.reset_TOS()
+        static_func = CompileSIRCache()(self, sir_name, input_spec, **kwargs)

-        return static_func, cur_sir
+        return static_func
diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py
index ddf0cf9c8b02e..c281e335efb3d 100644
--- a/python/paddle/jit/translated_layer.py
+++ b/python/paddle/jit/translated_layer.py
@@ -892,8 +892,7 @@ def _run_dygraph(instance, input, program_holder):
     for i, value in enumerate(input):
         if not isinstance(value, (np.ndarray, core.eager.Tensor)):
             raise TypeError(
-                "The type of input in TranslatedLayer must be numpy array or Variable(Tensor), but received %s."
-                % type(value)
+                f"The type of input in TranslatedLayer must be numpy array or Variable(Tensor), but received {type(value)}."
            )
         # NOTE: In order to unify the API, firstly convert the input to Tensor
         if isinstance(value, np.ndarray):
@@ -925,8 +924,7 @@ def _run_dygraph(instance, input, program_holder):
             persistable_vars.append(instance._buffers[dy_var_name])
         else:
             raise ValueError(
-                "The persistable variable %s does not exist in current TranslatedLayer."
-                % var_name
+                f"The persistable variable {var_name} does not exist in current TranslatedLayer."
             )

     output_vars = []
@@ -1426,7 +1424,7 @@ def _construct(model_path, configs=None):
     # 0. dir and filename check
     model_path = os.path.normpath(model_path)
     if not os.path.isdir(model_path):
-        raise ValueError("There is no directory named '%s'" % model_path)
+        raise ValueError(f"There is no directory named '{model_path}'")
     model_filename = None
     params_filename = None
     if configs is not None:
@@ -1591,8 +1589,7 @@ def _get_program_holder(self, method_name='forward'):
         program_holder = self._program_holder_dict.get(method_name, None)
         if program_holder is None:
             raise ValueError(
-                "The method `%s` does not exist in loaded TranslatedLayer."
-                % method_name
+                f"The method `{method_name}` does not exist in loaded TranslatedLayer."
             )
         return program_holder
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index a9d8312bb4ca0..42793a6496ad3 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -69,7 +69,9 @@
     Upsample,
     UpsamplingBilinear2D,
     UpsamplingNearest2D,
+    ZeroPad1D,
     ZeroPad2D,
+    ZeroPad3D,
 )

 # TODO: import all neural network related api under this directory,
@@ -300,4 +302,6 @@
     'Unflatten',
     'FractionalMaxPool2D',
     'FractionalMaxPool3D',
+    'ZeroPad1D',
+    'ZeroPad3D',
 ]
diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py
index e281d6cd48589..270f0bb9234ea 100644
--- a/python/paddle/nn/initializer/__init__.py
+++ b/python/paddle/nn/initializer/__init__.py
@@ -18,7 +18,7 @@
     Assign,
     NumpyArrayInitializer,  # noqa: F401
 )
-from .Bilinear import Bilinear
+from .bilinear import Bilinear
 from .constant import (
     Constant,
     ConstantInitializer,  # noqa: F401
diff --git a/python/paddle/nn/initializer/Bilinear.py b/python/paddle/nn/initializer/bilinear.py
similarity index 100%
rename from python/paddle/nn/initializer/Bilinear.py
rename to python/paddle/nn/initializer/bilinear.py
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 6faf07bb6eb19..6b34c9fa90f6b 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1077,6 +1077,67 @@ def extra_repr(self):
         return f'padding={self._pad}, mode={self._mode}, value={self._value}, data_format={self._data_format}{name_str}'


+class ZeroPad1D(Layer):
+    """
+    This interface is used to construct a callable object of the ``ZeroPad1D`` class.
+    Pads the input tensor boundaries with zero.
+
+    Parameters:
+        padding (Tensor | List[int] | int): The padding size with data type int. If it is an int, the
+            same padding is used in all dimensions. Otherwise, [len(padding)/2] dimensions of the input will be padded.
+            The pad has the form (pad_left, pad_right).
+        data_format (str): A string from: "NCL", "NLC". Specify the data format of the input data.
+            Default is "NCL"
+        name (str, optional): The default value is None. Normally there is no need for
+            user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - x(Tensor): The input tensor of zeropad1d operator, which is a 3-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of zeropad1d operator, which is a 3-D tensor.
+          The data type is the same as input x.
+
+    Examples:
+
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.nn as nn
+
+            >>> input_shape = (1, 2, 3)
+            >>> pad = [1, 2]
+            >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1
+            >>> my_pad = nn.ZeroPad1D(padding=pad)
+            >>> result = my_pad(data)
+            >>> print(result)
+            Tensor(shape=[1, 2, 6], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[0., 1., 2., 3., 0., 0.],
+              [0., 4., 5., 6., 0., 0.]]])
+    """
+
+    def __init__(self, padding, data_format="NCL", name=None):
+        super().__init__()
+        self._pad = _npairs(padding, 1)
+        self._mode = 'constant'
+        self._value = 0.0
+        self._data_format = data_format
+        self._name = name
+
+    def forward(self, x):
+        return F.pad(
+            x,
+            pad=self._pad,
+            mode=self._mode,
+            value=self._value,
+            data_format=self._data_format,
+            name=self._name,
+        )
+
+    def extra_repr(self):
+        name_str = f', name={self._name}' if self._name else ''
+        return f'padding={self._pad}, data_format={self._data_format}{name_str}'
+
+
 class Pad2D(Layer):
     """
     This interface is used to construct a callable object of the ``Pad2D`` class.
@@ -1290,6 +1351,70 @@ def extra_repr(self):
         return f'padding={self._pad}, mode={self._mode}, value={self._value}, data_format={self._data_format}{name_str}'


+class ZeroPad3D(Layer):
+    """
+    This interface is used to construct a callable object of the ``ZeroPad3D`` class.
+    Pads the input tensor boundaries with zero.
+
+    Parameters:
+        padding (Tensor | List[int] | int): The padding size with data type int. If it is an int, the
+            same padding is used in all dimensions. Otherwise, [len(padding)/2] dimensions of the input will be padded.
+            The pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
+        data_format (str): A string from: "NCDHW", "NDHWC". Specify the data format of the input data.
+            Default is "NCDHW"
+        name (str, optional): The default value is None. Normally there is no need for
+            user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Shape:
+        - x(Tensor): The input tensor of zeropad3d operator, which is a 5-D tensor.
+          The data type can be float32, float64.
+        - output(Tensor): The output tensor of zeropad3d operator, which is a 5-D tensor.
+          The data type is the same as input x.
+
+    Examples:
+
+        .. 
code-block:: python + + >>> import paddle + >>> import paddle.nn as nn + + >>> input_shape = (1, 1, 1, 2, 3) + >>> pad = [1, 0, 1, 2, 0, 0] + >>> data = paddle.arange(paddle.prod(paddle.to_tensor(input_shape)), dtype="float32").reshape(input_shape) + 1 + >>> my_pad = nn.ZeroPad3D(padding=pad) + >>> result = my_pad(data) + >>> print(result) + Tensor(shape=[1, 1, 1, 5, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[[0., 0., 0., 0.], + [0., 1., 2., 3.], + [0., 4., 5., 6.], + [0., 0., 0., 0.], + [0., 0., 0., 0.]]]]]) + """ + + def __init__(self, padding, data_format="NCDHW", name=None): + super().__init__() + self._pad = _npairs(padding, 3) + self._mode = 'constant' + self._value = 0.0 + self._data_format = data_format + self._name = name + + def forward(self, x): + return F.pad( + x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name, + ) + + def extra_repr(self): + name_str = f', name={self._name}' if self._name else '' + return f'padding={self._pad}, data_format={self._data_format}{name_str}' + + class CosineSimilarity(Layer): """ This interface is used to compute cosine similarity between x1 and x2 along axis. diff --git a/python/paddle/nn/quant/quantized_linear.py b/python/paddle/nn/quant/quantized_linear.py index 1c2d962f720cf..41ad1839e1f8a 100644 --- a/python/paddle/nn/quant/quantized_linear.py +++ b/python/paddle/nn/quant/quantized_linear.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle import _C_ops, version +import paddle +from paddle import _C_ops from paddle.base.data_feeder import check_dtype from paddle.base.framework import convert_np_dtype_to_dtype_ from paddle.device.cuda import get_device_capability @@ -24,7 +25,7 @@ def _get_arch_info(): # Get SMVersion from device. 
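# [Sketch; the motivation is inferred, not stated in the patch] The change
# below swaps a module attribute bound at import time for a lookup performed
# at call time:
#
#     from paddle import version          # binds the generated module once
#     cuda = version.cuda()
#
#     import paddle                       # resolves paddle.version per call
#     cuda = paddle.version.cuda()        # returns e.g. '11.8', or 'False'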
-    cuda_version = version.cuda()
+    cuda_version = paddle.version.cuda()
     if cuda_version is not None and cuda_version != 'False':
         major, minor = get_device_capability()
         arch = int(major * 10 + minor)
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index 469145ac6a832..e508cbdd43c19 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -63,6 +63,7 @@
     load_pir,
     load_pir_inference_model,
     load_vars_pir,
+    normalize_pir_program,
     save_pir,
     save_pir_inference_model,
     save_vars_pir,
@@ -183,6 +184,8 @@ def normalize_program(program, feed_vars, fetch_vars, **kwargs):
             >>> normalized_program = paddle.static.normalize_program(program, [image], [predict])

    """
+    if in_pir_mode():
+        return normalize_pir_program(program, feed_vars, fetch_vars, **kwargs)
     if not isinstance(program, Program):
         raise TypeError(
             "program type must be `base.Program`, but received `%s`"
diff --git a/python/paddle/static/nn/metric.py b/python/paddle/static/nn/metric.py
index d2252ebc0a0bc..2be2cecf18742 100644
--- a/python/paddle/static/nn/metric.py
+++ b/python/paddle/static/nn/metric.py
@@ -245,6 +245,28 @@ def auc(
             [array(1.)]

    """
+    if in_pir_mode():
+        if ins_tag_weight is None:
+            ins_tag_weight = paddle.full(
+                shape=[1, 1], dtype="float32", fill_value=1.0
+            )
+        stat_pos = paddle.zeros(shape=[1, num_thresholds + 1], dtype="int64")
+        stat_neg = paddle.zeros(shape=[1, num_thresholds + 1], dtype="int64")
+        auc_out, batch_stat_pos, batch_stat_neg = _C_ops.auc(
+            input,
+            label,
+            stat_pos,
+            stat_neg,
+            ins_tag_weight,
+            curve,
+            num_thresholds,
+            slide_steps,
+        )
+        return (
+            auc_out,
+            batch_stat_pos,
+            batch_stat_neg,
+        )

     helper = LayerHelper("auc", **locals())
     if ins_tag_weight is None:
diff --git a/python/paddle/static/pir_io.py b/python/paddle/static/pir_io.py
index 38e5e69cfdbb1..bd9b5305b7696 100644
--- a/python/paddle/static/pir_io.py
+++ b/python/paddle/static/pir_io.py
@@ -251,7 +251,13 @@ def normalize_pir_program(program, feed_vars, fetch_vars, **kwargs):
     if not all(isinstance(v, pir.Value) for v in fetch_vars):
         raise TypeError("fetch_vars type must be a Value or a list of Value.")

-    # TODO(Ruting) remind users to set auc_states to 0 if auc op were found.
+    # remind users to set auc_states to 0 if an auc op is found.
+    for op in program.global_block().ops:
+        if op.name() == 'pd_op.auc':
+            warnings.warn(
+                "Be sure that you have set auc states to 0 before saving inference model."
+            )
+            break

     # fix the bug that the activation op's output as target will be pruned.
     # will affect the inference performance.
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 4de5e392a8493..553ea2cc5bbee 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -142,6 +142,7 @@
     atleast_1d,
     atleast_2d,
     atleast_3d,
+    block_diag,
     broadcast_tensors,
     broadcast_to,
     cast,
@@ -306,6 +307,7 @@
     inner,
     inverse,
     isfinite,
+    isin,
     isinf,
     isnan,
     isneginf,
@@ -544,6 +546,7 @@
     'hypot_',
     'nansum',
     'nanmean',
+    'block_diag',
     'count_nonzero',
     'tanh',
     'tanh_',
@@ -587,6 +590,7 @@
     'kron',
     'kthvalue',
     'isfinite',
+    'isin',
     'isinf',
     'isnan',
     'isneginf',
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 24c60af7499e6..9f0d808a8b2b4 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -1644,7 +1644,7 @@ def meshgrid(*args, **kwargs):
     Args:
         *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,),
-            (N2,),..., (Nk,). Support data types: ``float64``, ``float16``, ``float32``, ``int32``, ``int64``.
+            (N2,),..., (Nk,). Support data types: ``float64``, ``bfloat16``, ``float16``, ``float32``, ``int32``, ``int64``, ``complex64``, ``complex128``.
         **kwargs (optional): Currently, only accept name in **kwargs
             The default value is None. Normally there is no need for
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
@@ -1686,7 +1686,16 @@
         check_dtype(
             input_.dtype,
             'create data type',
-            ['uint16', 'float16', 'float32', 'float64', 'int32', 'int64'],
+            [
+                'uint16',
+                'float16',
+                'float32',
+                'float64',
+                'int32',
+                'int64',
+                'complex64',
+                'complex128',
+            ],
             'meshgrid',
         )
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 9803d4a8c5c0a..2ba51595cc94d 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3971,7 +3971,7 @@ def tile(x, repeat_times, name=None):
     Both the number of dimensions of ``x`` and the number of elements in ``repeat_times`` should be less than or equal to 6.

     Args:
-        x (Tensor): The input tensor, its data type should be bool, float16, float32, float64, int32 or int64.
+        x (Tensor): The input tensor, its data type should be bool, float16, float32, float64, int32, int64, complex64 or complex128.
         repeat_times (list|tuple|Tensor): The number of repeating times. If repeat_times is a list or tuple, all its elements
             should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
@@ -4038,6 +4038,8 @@ def check_input(x, repeat_times):
                 'float64',
                 'int32',
                 'int64',
+                'complex64',
+                'complex128',
             ],
             'tile',
         )
@@ -4209,7 +4211,7 @@ def expand(x, shape, name=None):
     Both the number of dimensions of ``x`` and the number of elements in ``shape`` should be less than or equal to 6. And the number of dimensions of ``x`` should be less than the number of elements in ``shape``. The dimension to expand must have a value 0.

     Args:
-        x (Tensor): The input Tensor, its data type is bool, float16, float32, float64, int32, int64, uint8 or uint16.
+        x (Tensor): The input Tensor, its data type is bool, float16, float32, float64, int32, int64, uint8, uint16, complex64 or complex128.
         shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. If shape is a list or tuple, all its elements
             should be integers or 0-D or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32.
             The value -1 in shape means keeping the corresponding dimension unchanged.
@@ -4275,6 +4277,8 @@ def expand(x, shape, name=None):
             'int64',
             'uint8',
             'uint16',
+            'complex64',
+            'complex128',
         ],
         'expand',
     )
@@ -6861,3 +6865,67 @@ def slice_scatter(x, value, axes, starts, ends, strides, name=None):
     )

     return output
+
+
+def block_diag(inputs, name=None):
+    """
+    Create a block diagonal matrix from provided tensors.
+
+    Args:
+        inputs (list|tuple): ``inputs`` is a Tensor list or Tensor tuple, one or more tensors with 0, 1, or 2 dimensions.
+        name (str, optional): Name for the operation (optional, default is None).
+
+    Returns:
+        Tensor, A ``Tensor``. The data type is the same as ``inputs``.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+
+            >>> A = paddle.to_tensor([[4], [3], [2]])
+            >>> B = paddle.to_tensor([7, 6, 5])
+            >>> C = paddle.to_tensor(1)
+            >>> D = paddle.to_tensor([[5, 4, 3], [2, 1, 0]])
+            >>> E = paddle.to_tensor([[8, 7], [7, 8]])
+            >>> out = paddle.block_diag([A, B, C, D, E])
+            >>> print(out)
+            Tensor(shape=[9, 10], dtype=int64, place=Place(gpu:0), stop_gradient=True,
+            [[4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+             [3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+             [2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+             [0, 7, 6, 5, 0, 0, 0, 0, 0, 0],
+             [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
+             [0, 0, 0, 0, 0, 5, 4, 3, 0, 0],
+             [0, 0, 0, 0, 0, 2, 1, 0, 0, 0],
+             [0, 0, 0, 0, 0, 0, 0, 0, 8, 7],
+             [0, 0, 0, 0, 0, 0, 0, 0, 7, 8]])
+    """
+
+    def to_col_block(arys, i, a):
+        return [
+            a
+            if idx == i
+            else paddle.zeros([ary.shape[0], a.shape[1]], dtype=a.dtype)
+            for idx, ary in enumerate(arys)
+        ]
+
+    def to_2d(ary):
+        if ary.ndim == 0:
+            return ary.unsqueeze(axis=0).unsqueeze(axis=0)
+        if ary.ndim == 1:
+            return ary.unsqueeze(axis=0)
+        if ary.ndim == 2:
+            return ary
+        raise ValueError(
+            "For 'block_diag', the dimension of each element in 'inputs' must be 0, 1, or 2, but got "
+            f"{ary.ndim}"
+        )
+
+    arys = [to_2d(ary) for ary in inputs]
+
+    matrix = [
+        paddle.concat(to_col_block(arys, idx, ary), axis=0)
+        for idx, ary in enumerate(arys)
+    ]
+    return paddle.concat(matrix, axis=1)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index d7d8669ff0c3b..3df4cf88c94b6 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -2726,7 +2726,7 @@ def inverse(x, name=None):
         x (Tensor): The input tensor. The last two
             dimensions should be equal. When the number of dimensions is
             greater than 2, it is treated as batches of square matrix. The data
-            type can be float32 and float64.
+            type can be float32, float64, complex64, complex128.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.

     Returns:
@@ -2751,7 +2751,12 @@
     else:

         def _check_input(x):
-            check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'inverse')
+            check_variable_and_dtype(
+                x,
+                'x',
+                ['float32', 'float64', 'complex64', 'complex128'],
+                'inverse',
+            )
             if len(x.shape) < 2:
                 raise ValueError(
                     "The input of inverse is expected to be a Tensor whose number "
@@ -7969,3 +7974,187 @@ def sinc_(x, name=None):
     paddle.sin_(x)
     paddle.divide_(x, tmp)
     return paddle.where(~paddle.isnan(x), x, paddle.full_like(x, 1.0))
+
+
+def isin(x, test_x, assume_unique=False, invert=False, name=None):
+    r"""
+    Tests if each element of `x` is in `test_x`.
+
+    Args:
+        x (Tensor): The input Tensor. Supported data type: 'bfloat16', 'float16', 'float32', 'float64', 'int32', 'int64'.
+        test_x (Tensor): Tensor values against which to test for each input element. Supported data type: 'bfloat16', 'float16', 'float32', 'float64', 'int32', 'int64'.
+        assume_unique (bool, optional): If True, indicates both `x` and `test_x` contain unique elements, which could make the calculation faster. Default: False.
+        invert (bool, optional): Indicate whether to invert the boolean return tensor. If True, invert the results. Default: False.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        out (Tensor), The output Tensor with the same shape as `x`.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> paddle.set_device('cpu')
+            >>> x = paddle.to_tensor([-0., -2.1, 2.5, 1.0, -2.1], dtype='float32')
+            >>> test_x = paddle.to_tensor([-2.1, 2.5], dtype='float32')
+            >>> res = paddle.isin(x, test_x)
+            >>> print(res)
+            Tensor(shape=[5], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [False, True, True, False, True])
+
+            >>> x = paddle.to_tensor([-0., -2.1, 2.5, 1.0, -2.1], dtype='float32')
+            >>> test_x = paddle.to_tensor([-2.1, 2.5], dtype='float32')
+            >>> res = paddle.isin(x, test_x, invert=True)
+            >>> print(res)
+            Tensor(shape=[5], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True, False, False, True, False])
+
+            >>> # Set `assume_unique` to True only when `x` and `test_x` contain unique values, otherwise the result may be incorrect.
+            >>> x = paddle.to_tensor([0., 1., 2.]*20).reshape([20, 3])
+            >>> test_x = paddle.to_tensor([0., 1.]*20)
+            >>> correct_result = paddle.isin(x, test_x, assume_unique=False)
+            >>> print(correct_result)
+            Tensor(shape=[20, 3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [[True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False],
+             [True , True , False]])
+
+            >>> incorrect_result = paddle.isin(x, test_x, assume_unique=True)
+            >>> print(incorrect_result)
+            Tensor(shape=[20, 3], dtype=bool, place=Place(gpu:0), stop_gradient=True,
+            [[True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , True ],
+             [True , True , False]])
+
+    """
+    if not isinstance(x, (paddle.Tensor, Variable, paddle.pir.Value)):
+        raise TypeError(f"x must be tensor type, but got {type(x)}")
+    if not isinstance(test_x, (paddle.Tensor, Variable, paddle.pir.Value)):
+        raise TypeError(f"test_x must be tensor type, but got {type(test_x)}")
+
+    check_variable_and_dtype(
+        x,
+        "x",
+        [
+            'uint16',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+        ],
+        "isin",
+    )
+
+    check_variable_and_dtype(
+        test_x,
+        "test_x",
+        [
+            'uint16',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+        ],
+        "isin",
+    )
+
+    x_zero_dim = False
+    if len(x.shape) == 0:
+        x = x.reshape([1])
+        x_zero_dim = True
+
+    size_x = math.prod(x.shape)
+    size_t = math.prod(test_x.shape)
+    if size_t < math.pow(size_x, 0.145) * 10.0:
+        # use brute-force searching if the test_x size is small
+        if len(x.shape) == 0:
+            return paddle.zeros([], dtype='bool')
+
+        tmp = x.reshape(tuple(x.shape) + ((1,) * test_x.ndim))
+        cmp = tmp == test_x
+        dim = tuple(range(-1, -test_x.ndim - 1, -1))
+        cmp = cmp.any(axis=dim)
+        if invert:
+            cmp = ~cmp
+    else:
+        x_flat = x.flatten()
+        test_x_flat = test_x.flatten()
+        if assume_unique:
+            # if x and test_x both contain unique elements, use stable argsort method which could be faster
+            all_elements = paddle.concat([x_flat, test_x_flat])
+            sorted_index = paddle.argsort(all_elements, stable=True)
+            sorted_x = all_elements[sorted_index]
+
+            duplicate_mask = paddle.full_like(sorted_index, False, dtype='bool')
+            if not in_dynamic_mode():
+                duplicate_mask = paddle.static.setitem(
+                    duplicate_mask,
+                    paddle.arange(duplicate_mask.numel() - 1),
+                    sorted_x[1:] == sorted_x[:-1],
+                )
+            else:
+                duplicate_mask[:-1] = sorted_x[1:] == sorted_x[:-1]
+
+            if invert:
+                duplicate_mask = duplicate_mask.logical_not()
+
+            mask = paddle.empty_like(duplicate_mask)
+            if not in_dynamic_or_pir_mode():
+                mask = paddle.static.setitem(mask, sorted_index, duplicate_mask)
+            else:
+                mask[sorted_index] = duplicate_mask
+
+            cmp = mask[0 : x.numel()].reshape(x.shape)
+        else:
+            # otherwise use searchsorted method
+            sorted_test_x = paddle.sort(test_x_flat)
+            idx = paddle.searchsorted(sorted_test_x, x_flat)
+            test_idx = paddle.where(
+                idx < sorted_test_x.numel(),
+                idx,
+                paddle.zeros_like(idx, 'int64'),
+            )
+            cmp = sorted_test_x[test_idx] == x_flat
+            cmp = cmp.logical_not() if invert else cmp
+            cmp = cmp.reshape(x.shape)
+
+    if x_zero_dim:
+        return cmp.reshape([])
+    else:
+        return cmp
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 736ae891f2fb8..9ec4cd1e2ec7f 100755
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -130,6 +130,7 @@ def argsort(x, axis=-1, descending=False, stable=False, name=None):
         x,
         'x',
         [
+            'uint16',
             'float16',
             'float32',
             'float64',
diff --git a/python/paddle/tensor/tensor.prototype.pyi b/python/paddle/tensor/tensor.prototype.pyi
index 735c8da282545..9b011b602b5e3 100644
--- a/python/paddle/tensor/tensor.prototype.pyi
+++ b/python/paddle/tensor/tensor.prototype.pyi
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# The `Tensor` template for `tools/gen_tensor_stub.py` generates the stub file `tensor.pyi`.
-# Add docstring, attributes, methods and alias with type annotaions for `Tensor`
+# The `Tensor` template `tensor.prototype.pyi` is used by `tools/gen_tensor_stub.py` to generate the stub file `tensor.pyi`.
+# Add docstring, attributes, methods and alias with type annotations for `Tensor` in `tensor.prototype.pyi`
 # if not conveniently coding in original place (like c++ source file).

 from typing import Any, overload
diff --git a/python/setup.py.in b/python/setup.py.in
index 67d23a089aa37..98ccf8c61e41c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1054,6 +1054,36 @@ if '${WITH_STRIP}' == 'ON':
         if os.system(command) != 0:
             raise Exception("strip *.so failed, command: %s" % command)

+
+def check_build_dependency():
+    missing_modules = '''Missing build dependency: {dependency}
+Please run 'pip install -r python/requirements.txt' to make sure you have all the dependencies installed.
+'''.strip() + + with open('${PADDLE_SOURCE_DIR}' + '/python/requirements.txt') as f: + build_dependencies = ( + f.read().splitlines() + ) # Specify the dependencies to install + + python_dependencies_module = [] + installed_packages = [] + + for dependency in build_dependencies: + python_dependencies_module.append( + re.sub("_|-", '', re.sub(r"==.*|>=.*|<=.*", '', dependency)) + ) + reqs = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze']) + + for r in reqs.split(): + installed_packages.append( + re.sub("_|-", '', r.decode().split('==')[0]).lower() + ) + + for dependency in python_dependencies_module: + if dependency.lower() not in installed_packages: + raise RuntimeError(missing_modules.format(dependency=dependency)) + + def install_cpp_dist_and_build_test(paddle_install_dir, paddle_lib_test_dir): """install cpp distribution and build test target @@ -1095,6 +1125,9 @@ def install_cpp_dist_and_build_test(paddle_install_dir, paddle_lib_test_dir): subprocess.check_call(["cmake", "--build", paddle_lib_test_dir]) +# check build dependency +check_build_dependency() + # install cpp distribution if '${WITH_CPP_DIST}' == 'ON': paddle_install_dir = '${PADDLE_INSTALL_DIR}' @@ -1112,6 +1145,28 @@ package_data['paddle.base'] = package_data.get('paddle.base', []) + [ package_data['paddle.tensor'] = package_data.get('paddle.tensor', []) + ['tensor.pyi'] +def generate_tensor_stub(paddle_binary_dir, paddle_source_dir): + print('-'*2, 'Generate stub file tensor.pyi ... ') + script_path = paddle_source_dir + '/tools/' + sys.path.append(script_path) + import gen_tensor_stub + + gen_tensor_stub.generate_stub_file( + input_file=paddle_source_dir + + '/python/paddle/tensor/tensor.prototype.pyi', + output_file=paddle_binary_dir + '/python/paddle/tensor/tensor.pyi', + ) + + shutil.copy( + paddle_binary_dir + '/python/paddle/tensor/tensor.pyi', + paddle_source_dir + '/python/paddle/tensor/tensor.pyi', + ) + print('-'*2, 'End Generate stub file tensor.pyi ... ') + +# generate stub file `tensor.pyi` +generate_tensor_stub('${PADDLE_BINARY_DIR}', '${PADDLE_SOURCE_DIR}') + + with redirect_stdout(): setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in index 597e9b9187f6c..aa68da69a9f7c 100644 --- a/python/setup_cinn.py.in +++ b/python/setup_cinn.py.in @@ -220,10 +220,10 @@ if platform.system() == 'Linux' and platform.machine() == 'x86_64': cuda_major_version = version.split('.')[0] except Exception as e: raise ValueError("CUDA not found") - + install_requires.append(PADDLE_CUDA_INSTALL_REQUIREMENTS[cuda_major_version].split("|")) - - + + with redirect_stdout(): setup( diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 15cf679177709..40f16161ab71e 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -19,3 +19,4 @@ wandb>=0.13 ; python_version<"3.12" xlsxwriter==3.0.9 xdoctest==1.1.1 ubelt==1.3.3 # just for xdoctest +mypy==1.10.0 diff --git a/setup.py b/setup.py index aab6fe0bcfd82..6d9ce542c6a15 100644 --- a/setup.py +++ b/setup.py @@ -1796,6 +1796,25 @@ def submodules_not_exists_or_empty(folder): sys.exit(1) +def generate_tensor_stub(paddle_binary_dir, paddle_source_dir): + print('-' * 2, 'Generate stub file tensor.pyi ... 
') + script_path = paddle_source_dir + '/tools/' + sys.path.append(script_path) + import gen_tensor_stub + + gen_tensor_stub.generate_stub_file( + input_file=paddle_source_dir + + '/python/paddle/tensor/tensor.prototype.pyi', + output_file=paddle_binary_dir + '/python/paddle/tensor/tensor.pyi', + ) + + shutil.copy( + paddle_binary_dir + '/python/paddle/tensor/tensor.pyi', + paddle_source_dir + '/python/paddle/tensor/tensor.pyi', + ) + print('-' * 2, 'End Generate stub file tensor.pyi ... ') + + def main(): # Parse the command line and check arguments before we proceed with building steps and setup parse_input_command(filter_args_list) @@ -1875,6 +1894,9 @@ def main(): package_data['paddle.libs'], ) + # generate stub file `tensor.pyi` + generate_tensor_stub(paddle_binary_dir, paddle_source_dir) + setup( name=package_name, version=paddle_version, diff --git a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py index 595f58b206193..8cf3f185dcbfc 100644 --- a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py +++ b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh.py @@ -13,7 +13,6 @@ # limitations under the License. import os -import unittest import paddle import paddle.distributed as dist @@ -183,14 +182,11 @@ def run_pr_to_rs_case(self): tgt_out_value = (self._mesh.process_ids, [-1, 1, -1], {}) def run_pr_to_ss_case(self): - # [Partial(), Replicate()] --> [Shard(0), Shard(1)] - # raise NotImplementedError - with unittest.TestCase().assertRaises(NotImplementedError): - self.create_program( - [self.BATCH_SIZE, self.SEQ_LEN, self.HIDDEN_SIZE], - [dist.Partial(dist.ReduceType.kRedSum), dist.Replicate()], - [dist.Shard(0), dist.Shard(1)], - ) + self.create_program( + [self.BATCH_SIZE, self.SEQ_LEN, self.HIDDEN_SIZE], + [dist.Partial(dist.ReduceType.kRedSum), dist.Replicate()], + [dist.Shard(0), dist.Shard(1)], + ) def run_ss_to_ss_case(self): # [Shard(0), Shard(1)] --> [Shard(1), Shard(0)] diff --git a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py index 47bfb9a44df06..532426208c1ee 100644 --- a/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py +++ b/test/auto_parallel/hybrid_strategy/pir_reshard_nd_mesh_cross_mesh.py @@ -102,7 +102,7 @@ def run_pp_to_rr_case(self): rank_id = dist.get_rank() if rank_id in self._mesh0.process_ids: - assert new_ops_name[-1] == "pd_op.send_v2" + assert new_ops_name[2] == "pd_op.send_v2" else: assert new_ops_name[2] == "pd_op.recv_v2" assert new_ops_name[-2] == "pd_op.c_allreduce_sum_" diff --git a/test/auto_parallel/pir/mlp_demo_3d.py b/test/auto_parallel/pir/mlp_demo_3d.py index 41ac0d25f682a..a743aa218e659 100644 --- a/test/auto_parallel/pir/mlp_demo_3d.py +++ b/test/auto_parallel/pir/mlp_demo_3d.py @@ -118,50 +118,43 @@ def test_to_static_program(self): rank = paddle.distributed.get_rank() ops = dist_program.global_block().ops op_names = [op.name() for op in ops] - if rank < 4: - std_ops = [ - 'pd_op.data', - 'builtin.parameter', - 'pd_op.data', - 'pd_op.relu', - 'pd_op.matmul', - 'pd_op.relu', - 'dist_op.reshard', - 'dist_op.reshard', - 'pd_op.relu_grad', - 'pd_op.matmul_grad', - 'dist_op.reshard', - 'dist_op.reshard', - 'pd_op.relu_grad', - 'pd_op.sgd_', - ] - else: - std_ops = [ - 'pd_op.data', - 'builtin.parameter', - 'pd_op.data', - 'dist_op.reshard', - 'pd_op.matmul', - 'dist_op.reshard', - 'pd_op.relu', - 'pd_op.subtract', - 'pd_op.square', - 'pd_op.mean', - 
'builtin.shadow_output', - 'pd_op.full', - 'pd_op.full_like', - 'dist_op.reshard', - 'pd_op.mean_grad', - 'dist_op.reshard', - 'pd_op.square_grad', - 'pd_op.subtract_grad', - 'pd_op.relu_grad', - 'pd_op.matmul_grad', - 'dist_op.reshard', - 'dist_op.reshard', - 'pd_op.sgd_', - ] - + std_ops = [ + 'pd_op.data', + 'pd_op.data', + 'builtin.parameter', + 'builtin.parameter', + 'pd_op.data', + 'pd_op.data', + 'pd_op.relu', + 'pd_op.matmul', + 'pd_op.relu', + 'dist_op.reshard', + 'pd_op.matmul', + 'dist_op.reshard', + 'pd_op.relu', + 'pd_op.subtract', + 'pd_op.square', + 'pd_op.mean', + 'builtin.shadow_output', + 'pd_op.full', + 'pd_op.full_like', + 'dist_op.reshard', + 'pd_op.mean_grad', + 'dist_op.reshard', + 'pd_op.square_grad', + 'pd_op.subtract_grad', + 'pd_op.relu_grad', + 'pd_op.matmul_grad', + 'dist_op.reshard', + 'dist_op.reshard', + 'pd_op.relu_grad', + 'pd_op.matmul_grad', + 'dist_op.reshard', + 'dist_op.reshard', + 'pd_op.relu_grad', + 'pd_op.sgd_', + 'pd_op.sgd_', + ] assert op_names == std_ops def test_loss_value(self): diff --git a/test/auto_parallel/pir/pir_reshard_s_to_r.py b/test/auto_parallel/pir/pir_reshard_s_to_r.py index 933eb855730ea..1d4afcddf0d64 100644 --- a/test/auto_parallel/pir/pir_reshard_s_to_r.py +++ b/test/auto_parallel/pir/pir_reshard_s_to_r.py @@ -81,7 +81,7 @@ def run_pir_test_case(self): std_ops, ) elif self._shard == 1: - np.testing.assert_equal(main_program.num_ops(), 10) + np.testing.assert_equal(main_program.num_ops(), 8) std_ops = [ 'builtin.parameter', 'pd_op.data', @@ -89,9 +89,7 @@ def run_pir_test_case(self): 'pd_op.c_allgather', 'pd_op.full', 'pd_op.split_with_num', - 'builtin.split', 'pd_op.full', - 'builtin.combine', 'pd_op.concat', ] diff --git a/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py b/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py index 771fbf29491ba..6b2fab19e2dab 100644 --- a/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py +++ b/test/auto_parallel/pir/pir_reshard_s_to_r_cross_mesh.py @@ -65,12 +65,14 @@ def run_pir_test_case(self): ops = [op.name() for op in main_program.global_block().ops] if self._shard == 0: if paddle.distributed.get_rank() == 0: - np.testing.assert_equal(main_program.num_ops(), 4) + np.testing.assert_equal(main_program.num_ops(), 6) std_ops = [ 'builtin.parameter', 'pd_op.data', 'dist_op.shard_tensor', 'pd_op.send_v2', + 'dist_op.reshard', + 'pd_op.c_allgather', ] np.testing.assert_equal( ops, @@ -91,19 +93,25 @@ def run_pir_test_case(self): ) elif self._shard == 1: if paddle.distributed.get_rank() == 0: - np.testing.assert_equal(main_program.num_ops(), 4) + np.testing.assert_equal(main_program.num_ops(), 10) std_ops = [ 'builtin.parameter', 'pd_op.data', 'dist_op.shard_tensor', 'pd_op.send_v2', + 'dist_op.reshard', + 'pd_op.c_allgather', + 'pd_op.full', + 'pd_op.split_with_num', + 'pd_op.full', + 'pd_op.concat', ] np.testing.assert_equal( ops, std_ops, ) elif paddle.distributed.get_rank() == 1: - np.testing.assert_equal(main_program.num_ops(), 11) + np.testing.assert_equal(main_program.num_ops(), 9) std_ops = [ 'builtin.parameter', 'pd_op.data', @@ -112,9 +120,7 @@ def run_pir_test_case(self): 'pd_op.c_allgather', 'pd_op.full', 'pd_op.split_with_num', - 'builtin.split', 'pd_op.full', - 'builtin.combine', 'pd_op.concat', ] diff --git a/test/auto_parallel/reshard_p_to_r_cross_mesh.py b/test/auto_parallel/reshard_p_to_r_cross_mesh.py index 6960530bf3bb3..605a245cd19db 100644 --- a/test/auto_parallel/reshard_p_to_r_cross_mesh.py +++ b/test/auto_parallel/reshard_p_to_r_cross_mesh.py @@ -90,12 
+90,14 @@ def run_pir_static_test_case(self): ops = [op.name() for op in main_program.global_block().ops] if paddle.distributed.get_rank() == 0: - np.testing.assert_equal(main_program.num_ops(), 4) + np.testing.assert_equal(main_program.num_ops(), 6) std_ops = [ 'builtin.parameter', 'pd_op.data', 'dist_op.shard_tensor', 'pd_op.send_v2', + 'dist_op.reshard', + 'pd_op.c_allreduce_sum_', ] else: np.testing.assert_equal(main_program.num_ops(), 5) diff --git a/test/auto_parallel/spmd_rules/test_flatten_rule.py b/test/auto_parallel/spmd_rules/test_flatten_rule.py index 599b2ddf4bf95..9a9ae6b921842 100644 --- a/test/auto_parallel/spmd_rules/test_flatten_rule.py +++ b/test/auto_parallel/spmd_rules/test_flatten_rule.py @@ -38,7 +38,7 @@ def setUp(self): def test_flatten_infer_forward(self): # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] - # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 1] [ 0, -1, 1] + # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 1], ([0, -1, 1], [-1, 0, -1, -1, 1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1, 1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = 2 @@ -51,14 +51,17 @@ def test_flatten_infer_forward(self): infered_output_dist_attrs = result_dist_attrs[1] self.assertEqual(len(infered_input_dist_attrs), 1) - self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 2) self.assertEqual( infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, 1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, 1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 0, -1, -1, 1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] - # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, 1] [ -1, 0, 1] + # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, 1] ([ -1, 0, 1], [-1, -1, 0, -1, 1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = 2 @@ -74,9 +77,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, 1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0, 1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, 0, -1, 1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8, 24] - # dims_mapping: [-1, -1, 1, 0] --> [-1, -1, -1, 0] [ -1, -1, 0] + # dims_mapping: [-1, -1, 1, 0] --> [-1, -1, -1, 0] ([ -1, -1, 0], [-1, -1, -1, -1, 0] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 1, 0]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = 2 @@ -92,9 +98,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, 0] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1, 0]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, 0] + ) # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] - # dims_mapping: [-1, 0, 1, -1] --> [-1, -1, -1, -1] [ -1] + # dims_mapping: [-1, 0, 1, -1] --> [-1, -1, -1, -1] ([ -1], [-1, -1, -1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1, -1]) self.attrs['start_axis'] = 0 self.attrs['stop_axis'] = -1 @@ -110,9 +119,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] - # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, -1] [ 0] + # dims_mapping: [0, -1, -1, 1] --> [0, -1, -1, 
-1] ([ 0], [-1, 0, -1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1, 1]) self.attrs['start_axis'] = 0 self.attrs['stop_axis'] = -1 @@ -128,9 +140,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 0, -1, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8 * 16 * 8 * 24] - # dims_mapping: [1, 0, -1, -1] --> [1, -1, -1, -1] [ 1] + # dims_mapping: [1, 0, -1, -1] --> [1, -1, -1, -1] ([ 1], [-1, 1, -1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([1, 0, -1, -1]) self.attrs['start_axis'] = 0 self.attrs['stop_axis'] = -1 @@ -146,9 +161,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 1, -1, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] - # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, -1, -1] [-1, -1] + # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, -1, -1] ([-1, -1], [-1, -1, -1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = -1 @@ -164,9 +182,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] - # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, -1] [-1, 0] + # dims_mapping: [-1, 0, -1, 1] --> [-1, 0, -1, -1] ([-1, 0], [-1, -1, 0, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = -1 @@ -182,9 +203,12 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, 0, -1, -1] + ) # shape: [8, 16, 8, 24] --> [8, 16 * 8 * 24] - # dims_mapping: [0, 1, -1, -1] --> [0, 1, -1, -1] [0, 1] + # dims_mapping: [0, 1, -1, -1] --> [0, 1, -1, -1] ([0, 1], [-1, 0, 1, -1, -1] // xshape) self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) self.attrs['start_axis'] = 1 self.attrs['stop_axis'] = -1 @@ -200,6 +224,9 @@ def test_flatten_infer_forward(self): infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1] ) self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, 1]) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 0, 1, -1, -1] + ) def test_flatten_infer_backward(self): process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc index 0ae0b8ed3eaf1..d908261840cfc 100644 --- a/test/cpp/auto_parallel/spmd_rule_test.cc +++ b/test/cpp/auto_parallel/spmd_rule_test.cc @@ -1853,6 +1853,70 @@ TEST(CumSumGradInferSpmd, Ctor) { std::vector<int64_t>({-1, -1, -1})); } +TEST(Flatten, Ctor) { + std::vector<int64_t> mesh_shape = {2, 2}; + std::vector<int64_t> process_ids = {0, 1, 2, 3}; + std::vector<std::string> dim_names = {"x", "y"}; + ProcessMesh process_mesh(mesh_shape, process_ids, dim_names); + + auto build_input = [&](const std::vector<int64_t>& shape, + const std::vector<int64_t>& 
dim_mapping) { + auto t_dist_attr = TensorDistAttr(); + t_dist_attr.set_process_mesh(process_mesh); + t_dist_attr.set_dims_mapping(dim_mapping); + t_dist_attr.set_dynamic_dims(std::vector<bool>(shape.size(), false)); + auto input = + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); + return input; + }; + + // [b, h/ph, w/pw, c, ph, pw]; dp + auto input1 = build_input({4, 16, 16, 4, 2, 2}, {0, -1, -1, -1, -1, -1}); + // [b, h/ph, w/pw, c, ph, pw] => [b, h/ph, w/pw, hidden_size] + auto spmd1 = FlattenInferSpmd(input1, -3, -1); + EXPECT_EQ(spmd1.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd1.second.size(), static_cast<size_t>(2)); + check_dim_mapping(spmd1.first[0], {0, -1, -1, -1, -1, -1}); + check_dim_mapping(spmd1.second[0], {0, -1, -1, -1}); + check_dim_mapping(spmd1.second[1], {-1, 0, -1, -1, -1, -1, -1}); // x_shape + + // [b, h/ph, w/pw, c, ph, pw]; dp, mp + auto input2 = build_input({4, 16, 16, 4, 2, 2}, {-1, 0, -1, 1, -1, -1}); + auto spmd2 = FlattenInferSpmd(input2, 1, 4); + EXPECT_EQ(spmd2.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd2.second.size(), static_cast<size_t>(2)); + check_dim_mapping(spmd2.first[0], {-1, 0, -1, -1, -1, -1}); + check_dim_mapping(spmd2.second[0], {-1, 0, -1}); + check_dim_mapping(spmd2.second[1], {-1, -1, 0, -1, -1, -1, -1}); // x_shape + + // [b, s, nh, h/nh]; dp , mp + auto input3 = build_input({2, 1024, 32, 32}, {0, -1, 1, -1}); + // [b, s, nh, h/nh] => [b, s, h] + auto spmd3 = FlattenInferSpmd(input3, 2, 3); + EXPECT_EQ(spmd3.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd3.second.size(), static_cast<size_t>(2)); + check_dim_mapping(spmd3.first[0], {0, -1, 1, -1}); + check_dim_mapping(spmd3.second[0], {0, -1, 1}); + check_dim_mapping(spmd3.second[1], {-1, 0, -1, 1, -1}); // x_shape + + // [b, c, d, h, w]; dp, mp + auto input4 = build_input({4, 16, 16, 4, 16}, {-1, -1, 0, 1, -1}); + auto spmd4 = FlattenInferSpmd(input4, 1, 4); + EXPECT_EQ(spmd4.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd4.second.size(), static_cast<size_t>(2)); + check_dim_mapping(spmd4.first[0], {-1, -1, -1, -1, -1}); + check_dim_mapping(spmd4.second[0], {-1, -1}); + check_dim_mapping(spmd4.second[1], {-1, -1, -1, -1, -1, -1}); // x_shape + + auto out_grad = build_input({2, 1024, 1024}, {-1, -1, -1}); + auto xshape = build_input({0, 2, 1024, 4, 1024 / 4}, {-1, 0, 1, -1, -1}); + auto spmd_grad = FlattenGradInferSpmd(xshape, out_grad); + EXPECT_EQ(spmd_grad.first.size(), static_cast<size_t>(1)); + EXPECT_EQ(spmd_grad.second.size(), static_cast<size_t>(1)); + check_dim_mapping(spmd_grad.first[0], {0, 1, -1}); + check_dim_mapping(spmd_grad.second[0], {0, 1, -1, -1}); +} + } // namespace auto_parallel } // namespace distributed } // namespace paddle diff --git a/test/cpp/pir/cinn/compilation_task_test.cc b/test/cpp/pir/cinn/compilation_task_test.cc index 254ab7c4baf8a..3fbe4ed4ba60b 100644 --- a/test/cpp/pir/cinn/compilation_task_test.cc +++ b/test/cpp/pir/cinn/compilation_task_test.cc @@ -24,6 +24,7 @@ #include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h" #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" #include "paddle/cinn/hlir/framework/pir/compilation_task.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/cinn/utils/data_util.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" @@ -34,6 +35,7 @@ PD_DECLARE_bool(cinn_bucket_compile); +using cinn::hlir::framework::pir::CompatibleInfo; using 
cinn::hlir::framework::pir::OpLoweringGroup; using cinn::hlir::framework::pir::OpLoweringGroupPtr; @@ -50,8 +52,11 @@ ProgramInfo BuildProgram(std::vector<int64_t> input_shape) { input_shape, value_one, phi::DataType::FLOAT32, phi::GPUPlace()); std::vector<OpLoweringGroupPtr> groups; + const std::string fn_name = CompatibleInfo::GroupOpsName( + std::initializer_list<::pir::Operation*>({full_op_x.operation()})); groups.emplace_back(std::make_shared<OpLoweringGroup>( - std::initializer_list<::pir::Operation*>({full_op_x.operation()}))); + std::initializer_list<::pir::Operation*>({full_op_x.operation()}), + fn_name)); groups.back()->mut_output_ops().insert(full_op_x.operation()); return {program, groups}; diff --git a/test/cpp/pir/cinn/file_tile_config_test.cc b/test/cpp/pir/cinn/file_tile_config_test.cc index 3cdcc7a390bbe..d863baca924f7 100644 --- a/test/cpp/pir/cinn/file_tile_config_test.cc +++ b/test/cpp/pir/cinn/file_tile_config_test.cc @@ -39,7 +39,7 @@ TEST(ConfigSearcher, TestReduceDemo) { constexpr int kMaxThreadsPerBlock = 1024; // Step 1: Construct iter space and tile config. - cinn::ir::search::IterSpace iter_space; + cinn::ir::BucketInfo bucket_info; int s_dimension_lower = 32; int s_dimension_upper = 128; auto s_dimension_type = "S"; @@ -49,61 +49,52 @@ TEST(ConfigSearcher, TestReduceDemo) { auto r_dimension_type = "R"; auto r_dimension_is_dynamic = true; - iter_space.space.push_back(cinn::ir::search::IterSpace::Dimension{ - s_dimension_lower, - s_dimension_upper, - s_dimension_type, - s_dimension_is_dynamic, - std::vector<double>(128 - 32, 1.0)}); - iter_space.space.push_back( - cinn::ir::search::IterSpace::Dimension{r_dimension_lower, - r_dimension_upper, - r_dimension_type, - r_dimension_is_dynamic, - std::vector<double>(1, 1.0)}); - cinn::ir::BucketInfo bucket_info; - bucket_info.sp_lower_bound = iter_space.space[0].lower_bound; - bucket_info.sp_upper_bound = iter_space.space[0].upper_bound; - bucket_info.rb_lower_bound = iter_space.space[1].lower_bound; - bucket_info.rb_upper_bound = iter_space.space[1].upper_bound; + bucket_info.space.push_back( + cinn::ir::BucketInfo::Dimension{s_dimension_lower, + s_dimension_upper, + s_dimension_type, + s_dimension_is_dynamic, + std::vector<double>(128 - 32, 1.0)}); + bucket_info.space.push_back( + cinn::ir::BucketInfo::Dimension{r_dimension_lower, + r_dimension_upper, + r_dimension_type, + r_dimension_is_dynamic, + std::vector<double>(1, 1.0)}); + cinn::ir::ScheduleConfig::TileConfig tile_config; tile_config.spatial_inner_num = 32; tile_config.warp_num = 32; tile_config.tree_reduce_num = 128; std::vector<std::pair<std::string, std::string>> iter_space_type = { - std::make_pair("R", "dynamic"), std::make_pair("S", "dynamic")}; + std::make_pair("S", "dynamic"), std::make_pair("R", "dynamic")}; // Step 2: Add to json/Read from json cinn::ir::FileTileConfigDatabase file_database; - file_database.AddConfig(cinn::common::DefaultTarget(), - iter_space_type, - bucket_info, - tile_config, - 2); + file_database.AddConfig( + cinn::common::DefaultTarget(), bucket_info, tile_config, 2); cinn::ir::TileConfigMap tile_config_map = file_database.GetConfigs(cinn::common::DefaultTarget(), iter_space_type); for (auto& it : tile_config_map) { - LOG(INFO) << "sp_lower_bound is " << it.first.sp_lower_bound; - LOG(INFO) << "sp_upper_bound is " << it.first.sp_upper_bound; - LOG(INFO) << "rb_lower_bound is " << it.first.rb_lower_bound; - LOG(INFO) << "rb_upper_bound is " << it.first.rb_upper_bound; + LOG(INFO) << "bucket info is: "; + auto dims = 
it.first.space.size(); + for (int i = 0; i < dims; i++) { + LOG(INFO) << "Dimension " << i + << "'s lower_bound is: " << it.first.space[i].lower_bound; + LOG(INFO) << "Dimension " << i + << "'s upper_bound is: " << it.first.space[i].upper_bound; + auto dimension_lower = i == 0 ? s_dimension_lower : r_dimension_lower; + auto dimension_upper = i == 0 ? s_dimension_upper : r_dimension_upper; + PADDLE_ENFORCE_EQ(it.first.space[i].lower_bound, + dimension_lower, + ::common::errors::InvalidArgument( + "GetConfigs function gets wrong dimension_lower")); + PADDLE_ENFORCE_EQ(it.first.space[i].upper_bound, + dimension_upper, + ::common::errors::InvalidArgument( + "GetConfigs function gets wrong dimension_upper")); + } LOG(INFO) << "tile config is " << it.second.spatial_inner_num << " " << it.second.warp_num << " " << it.second.tree_reduce_num; - PADDLE_ENFORCE_EQ(it.first.sp_lower_bound, - s_dimension_lower, - ::common::errors::InvalidArgument( - "GetConfigs function gets wrong s_dimension_lower")); - PADDLE_ENFORCE_EQ(it.first.sp_upper_bound, - s_dimension_upper, - ::common::errors::InvalidArgument( - "GetConfigs function gets wrong s_dimension_upper")); - PADDLE_ENFORCE_EQ(it.first.rb_lower_bound, - r_dimension_lower, - ::common::errors::InvalidArgument( - "GetConfigs function gets wrong r_dimension_lower")); - PADDLE_ENFORCE_EQ(it.first.rb_upper_bound, - r_dimension_upper, - ::common::errors::InvalidArgument( - "GetConfigs function gets wrong r_dimension_upprt")); PADDLE_ENFORCE_EQ(it.second.spatial_inner_num, tile_config.spatial_inner_num, ::common::errors::InvalidArgument( diff --git a/test/cpp/pir/cinn/pir_compiler_test.cc b/test/cpp/pir/cinn/pir_compiler_test.cc index 8e2df8e02ac8c..622a4fec701f1 100644 --- a/test/cpp/pir/cinn/pir_compiler_test.cc +++ b/test/cpp/pir/cinn/pir_compiler_test.cc @@ -25,6 +25,7 @@ #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" #include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" #include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/cinn/utils/data_util.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" @@ -38,6 +39,7 @@ #include "paddle/pir/include/core/program.h" #include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" +using cinn::hlir::framework::pir::CompatibleInfo; using cinn::hlir::framework::pir::OpLoweringGroup; using cinn::hlir::framework::pir::OpLoweringGroupPtr; @@ -74,18 +76,26 @@ ProgramInfo BuildProgram() { builder.Build<pir::YieldOp>(std::vector<pir::Value>{relu_op_y.result(0)}); std::vector<OpLoweringGroupPtr> groups; + const auto full_op_x_ops = + std::initializer_list<::pir::Operation*>({full_op_x.operation()}); groups.emplace_back(std::make_shared<OpLoweringGroup>( - std::initializer_list<::pir::Operation*>( - {full_op_x.operation()}))); // For coverage + full_op_x_ops, + CompatibleInfo::GroupOpsName(full_op_x_ops))); // For coverage groups[0]->mut_output_values().push_back(groups[0]->ops().back()->result(0)); + + const auto full_op_y_ops = + std::initializer_list<::pir::Operation*>({full_op_y.operation()}); groups.emplace_back(std::make_shared<OpLoweringGroup>( - std::initializer_list<::pir::Operation*>({full_op_y.operation()}))); + full_op_y_ops, CompatibleInfo::GroupOpsName(full_op_y_ops))); + groups[1]->mut_output_values().push_back(groups[1]->ops().back()->result(0)); - groups.emplace_back(std::make_shared<OpLoweringGroup>( + const auto vector_ops = 
std::vector<::pir::Operation*>({tan_op_x.operation(), relu_op_x.operation(), tan_op_y.operation(), - relu_op_y.operation()}))); + relu_op_y.operation()}); + groups.emplace_back(std::make_shared<OpLoweringGroup>( + vector_ops, CompatibleInfo::GroupOpsName(vector_ops))); groups[2]->mut_output_values().push_back(groups[2]->ops().back()->result(0)); return {program, groups}; @@ -127,14 +137,16 @@ ProgramInfo BuildSoftmax() { auto yield_op = builder.Build<pir::YieldOp>(std::vector<pir::Value>{divide}); std::vector<OpLoweringGroupPtr> groups; - groups.emplace_back(std::make_shared<OpLoweringGroup>( + const auto vector_ops = std::initializer_list<::pir::Operation*>({max.defining_op(), broadcast_1.defining_op(), sub.defining_op(), exp.defining_op(), sum.defining_op(), broadcast_2.defining_op(), - divide.defining_op()}))); + divide.defining_op()}); + groups.emplace_back(std::make_shared<OpLoweringGroup>( + vector_ops, CompatibleInfo::GroupOpsName(vector_ops))); groups[0]->mut_output_values().push_back(groups[0]->ops().back()->result(0)); groups[0]->set_op_pattern_kind(cinn::hlir::framework::kReduction); diff --git a/test/cpp/pir/cinn/symbolic_lower_test.cc b/test/cpp/pir/cinn/symbolic_lower_test.cc index 83de069dd622e..0c748d9b96da8 100644 --- a/test/cpp/pir/cinn/symbolic_lower_test.cc +++ b/test/cpp/pir/cinn/symbolic_lower_test.cc @@ -24,6 +24,7 @@ #include "paddle/cinn/hlir/framework/pir/group.h" #include "paddle/cinn/hlir/framework/pir/op_lowering_group.h" #include "paddle/cinn/hlir/framework/pir/op_lowering_impl.h" +#include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" #include "paddle/common/ddim.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" @@ -39,6 +40,7 @@ PD_DECLARE_bool(cinn_bucket_compile); +using cinn::hlir::framework::pir::CompatibleInfo; using cinn::hlir::framework::pir::OpLoweringGroup; using cinn::hlir::framework::pir::OpLoweringGroupPtr; @@ -88,9 +90,11 @@ BuildGroupProgramForLowering() { builder.Build<paddle::dialect::FetchOp>(group_op->result(0), "out", 0); std::vector<OpLoweringGroupPtr> groups; - groups.emplace_back( - std::make_shared<OpLoweringGroup>(std::vector<::pir::Operation*>( - {exp.operation(), reshape.operation(), sub.operation()}))); + groups.emplace_back(std::make_shared<OpLoweringGroup>( + std::vector<::pir::Operation*>( + {exp.operation(), reshape.operation(), sub.operation()}), + CompatibleInfo::GroupOpsName(std::vector<::pir::Operation*>( + {exp.operation(), reshape.operation(), sub.operation()})))); groups[0]->mut_output_ops().insert(groups[0]->ops().back()); std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> value_to_shape_data; @@ -176,9 +180,11 @@ BuildBroadcastGroupProgramForLowering() { builder.Build<paddle::dialect::FetchOp>(group_op->result(0), "out", 0); std::vector<OpLoweringGroupPtr> groups; - groups.emplace_back( - std::make_shared<OpLoweringGroup>(std::vector<::pir::Operation*>( - {x_broadcast.operation(), sub.operation()}))); + groups.emplace_back(std::make_shared<OpLoweringGroup>( + std::vector<::pir::Operation*>( + {x_broadcast.operation(), sub.operation()}), + CompatibleInfo::GroupOpsName(std::vector<::pir::Operation*>( + {x_broadcast.operation(), sub.operation()})))); groups[0]->mut_output_ops().insert(groups[0]->ops().back()); std::unordered_map<::pir::Value, symbol::ShapeOrDataDimExprs> diff --git a/test/cpp/pir/cinn/tile_config_searcher_test.cc b/test/cpp/pir/cinn/tile_config_searcher_test.cc index f54aa848b655a..289113a96bbab 100644 --- 
a/test/cpp/pir/cinn/tile_config_searcher_test.cc +++ b/test/cpp/pir/cinn/tile_config_searcher_test.cc @@ -66,22 +66,22 @@ TEST(ConfigSearcher, TestReduceDemo) { schedule_config_manager.SetPolicy("custom"); // Step 3: Construct iter space and objective function. - cinn::ir::search::IterSpace iter_space; - iter_space.space.push_back(cinn::ir::search::IterSpace::Dimension{ - 33, - 128, - "S", - /* is_dynamic = */ true, - std::vector<double>(128 - 32, 1.0)}); - iter_space.space.push_back( - cinn::ir::search::IterSpace::Dimension{1024, - 1024, - "R", - /* is_dynamic = */ false, - std::vector<double>(1, 1.0)}); + cinn::ir::BucketInfo bucket_info; + bucket_info.space.push_back( + cinn::ir::BucketInfo::Dimension{33, + 128, + "S", + /* is_dynamic = */ true, + std::vector<double>(128 - 32, 1.0)}); + bucket_info.space.push_back( + cinn::ir::BucketInfo::Dimension{1024, + 1024, + "R", + /* is_dynamic = */ false, + std::vector<double>(1, 1.0)}); std::unique_ptr<cinn::ir::search::BaseObjectiveFunc> obj_func = std::make_unique<cinn::ir::search::WeightedSamplingTrailObjectiveFunc>( - program.get(), iter_space); + program.get(), bucket_info); // Step 4: Construct config candidate range and constraints. std::vector<std::pair<int, int>> candidate_range{ diff --git a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc index 9ec1928ef10ff..a7674d60451cd 100644 --- a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc +++ b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc @@ -426,7 +426,7 @@ TEST(pattern_rewrite, Patterns) { // true)); CHECK_EQ(pm.Run(&program), true); - EXPECT_EQ(program.block()->size(), 19u); + EXPECT_EQ(program.block()->size(), 17u); } void BuildConstantFoldingProgram(pir::Program *program, diff --git a/test/deprecated/legacy_test/CMakeLists.txt b/test/deprecated/legacy_test/CMakeLists.txt index 18891bc1cb65e..12d4734020e2c 100644 --- a/test/deprecated/legacy_test/CMakeLists.txt +++ b/test/deprecated/legacy_test/CMakeLists.txt @@ -130,7 +130,6 @@ endif() if(WIN32) list(REMOVE_ITEM TEST_OPS test_complex_matmul) - list(REMOVE_ITEM TEST_OPS test_ops_nms) list(REMOVE_ITEM TEST_OPS test_trt_convert_preln_residual_bias) list(REMOVE_ITEM TEST_OPS test_masked_multihead_attention_op) list(REMOVE_ITEM TEST_OPS test_fused_ec_moe_op) @@ -401,8 +400,6 @@ endfunction() list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type) list(REMOVE_ITEM TEST_OPS test_fetch_lod_tensor_array) list(REMOVE_ITEM TEST_OPS test_data_norm_op) -list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) -list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_layers_deprecated) @@ -452,8 +449,7 @@ endif() # Some ops need to check results when gc is enabled # Currently, only ops that register NoNeedBufferVarsInference need to do this test -set(TEST_OPS_WITH_GC test_affine_channel_op test_gather_nd_op test_scatter_op - test_slice_op) +set(TEST_OPS_WITH_GC test_gather_nd_op test_slice_op) foreach(TEST_OP ${TEST_OPS_WITH_GC}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) @@ -485,10 +481,6 @@ set_tests_properties(test_logcumsumexp_op PROPERTIES TIMEOUT 30) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS - ${GC_ENVS}) -py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS - ${GC_ENVS}) 
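# Note: py_test_modules(<target> MODULES <module> [ENVS <VAR=value> ...]) is the
# CMake helper this file uses to register a Python unittest module as a ctest
# target with extra environment variables, as in the test_imperative_mnist
# registration just below. A sketch with a hypothetical module and flag name:
#   py_test_modules(test_foo MODULES test_foo ENVS FLAGS_some_flag=true)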
py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS FLAGS_cudnn_deterministic=1) py_test_modules( @@ -511,8 +503,6 @@ if((WITH_GPU) AND (WITH_CUDNN_FRONTEND)) test_fused_dot_product_attention_op) endif() -set_tests_properties(test_conv2d_op_depthwise_conv - PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv2d_api_deprecated PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") @@ -644,17 +634,10 @@ endif() set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200) -set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_imperative_star_gan_with_gradient_penalty - PROPERTIES TIMEOUT 120) -set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 200) -set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) -set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150) if(NOT WIN32) if(WITH_NV_JETSON) @@ -666,75 +649,45 @@ set_tests_properties(test_bilateral_slice_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_util PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_transformer_sorted_gradient PROPERTIES TIMEOUT 120) -set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_static_save_load PROPERTIES TIMEOUT 250) -set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120) -if(WIN32) - set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900) -else() - set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600) -endif() if(WITH_NV_JETSON) set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 1200) set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 1500) - set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500) else() set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 250) - set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) endif() set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 200) -set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120) set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120) set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) set_tests_properties(test_imperative_ptb_rnn_sorted_gradient PROPERTIES TIMEOUT 120) -set_tests_properties(test_crop_tensor_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ptb_rnn PROPERTIES TIMEOUT 120) -set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80) -set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60) -set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_masked_select_op PROPERTIES TIMEOUT 120) 
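# Note: ctest interprets the TIMEOUT property as a per-test limit in seconds,
# and the LABELS "RUN_TYPE=EXCLUSIVE" tags set in this file can be used to
# select or skip those tests via ctest's label filters, e.g. (illustrative):
#   ctest -L "RUN_TYPE=EXCLUSIVE"    # run only the exclusive tests
#   ctest -LE "RUN_TYPE=EXCLUSIVE"   # run everything except them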
set_tests_properties(test_sigmoid_cross_entropy_with_logits_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150) -set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) set_tests_properties(test_generator_dataloader_deprecated PROPERTIES TIMEOUT 120) -set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120) set_tests_properties(test_reduce_op PROPERTIES TIMEOUT 500) set_tests_properties(test_conv_nn_grad PROPERTIES TIMEOUT 220) set_tests_properties(test_program_prune_backward PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 250) -set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_decoupled_py_reader PROPERTIES TIMEOUT 120) set_tests_properties(test_fuse_bn_act_pass PROPERTIES TIMEOUT 120) -set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_api_deprecated PROPERTIES TIMEOUT 120) -set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120) -set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) -set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_slice_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_dataloader_keep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) -set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 300) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) -set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60) -set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60) set_tests_properties(test_uniform_random_op_deprecated PROPERTIES TIMEOUT 60) set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 120) @@ -756,36 +709,17 @@ set_tests_properties(test_inplace_addto_strategy_deprecated PROPERTIES TIMEOUT set(TEST_CINN_OPS test_softmax_op - test_expand_v2_op test_reduce_op test_slice_op - test_full_like_op - test_index_select_op - test_top_k_v2_op - test_elementwise_mul_op test_gather_nd_op - test_elementwise_pow_op - test_reshape_op - test_meshgrid_op test_scale_op - test_scatter_op test_layer_norm_op - test_cast_op - test_roll_op - test_atan2_op - test_top_k_op test_where_op test_arg_min_max_op - test_reverse_op - test_flip - test_triangular_solve_op test_scatter_nd_op test_instance_norm_op test_cumsum_op - test_split_op - test_erf_op - test_assign_op - test_flatten_contiguous_range_op) + test_erf_op) foreach(TEST_CINN_OP ${TEST_CINN_OPS}) if(WITH_CINN) @@ -810,16 +744,12 @@ set(STATIC_BUILD_TESTS test_batch_norm_op test_bincount_op test_decoupled_py_reader - test_eigh_op test_fetch_lod_tensor_array test_fuse_bn_act_pass test_layer_norm_op test_lookup_table_v2_op_deprecated - test_matmul_op - test_matmul_v2_op test_momentum_op test_nce - test_paddle_save_load_binary test_reduce_op test_sparse_conv_op test_sparse_norm_op @@ -863,11 +793,7 @@ set_tests_properties( ENVIRONMENT 
"FLAGS_cudnn_deterministic=1;FLAGS_cudnn_batchnorm_spatial_persistent=1;FLAGS_conv_workspace_size_limit=1000" ) -set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120) -set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_layer_norm_op_static_build PROPERTIES TIMEOUT 1500) -set_tests_properties(test_paddle_save_load_binary_static_build - PROPERTIES TIMEOUT 120) set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500) py_test_modules(test_stride MODULES test_stride ENVS FLAGS_use_stride_kernel=true) @@ -875,6 +801,5 @@ py_test_modules(test_stride MODULES test_stride ENVS set_tests_properties(test_linalg_matrix_exp PROPERTIES TIMEOUT 120) set_pir_tests_properties() -set_tests_properties(test_fractional_max_pool2d_op PROPERTIES TIMEOUT 120) - set_tests_properties(test_reduce_as_op PROPERTIES TIMEOUT 30) +set_tests_properties(test_attribute_var_deprecated PROPERTIES TIMEOUT 100) diff --git a/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py b/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py index 880a7cf949a62..5ed16ca8675b1 100644 --- a/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py +++ b/test/deprecated/legacy_test/test_adaptive_avg_pool2d.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle.base import Program, core, program_guard diff --git a/test/deprecated/legacy_test/test_arg_min_max_op.py b/test/deprecated/legacy_test/test_arg_min_max_op.py index c35fa9f8f7d39..69b98997aeed5 100644 --- a/test/deprecated/legacy_test/test_arg_min_max_op.py +++ b/test/deprecated/legacy_test/test_arg_min_max_op.py @@ -17,7 +17,7 @@ import numpy as np from op_test import OpTest, convert_float_to_uint16 -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle.base import Program, program_guard diff --git a/test/deprecated/legacy_test/test_attribute_var_deprecated.py b/test/deprecated/legacy_test/test_attribute_var_deprecated.py new file mode 100644 index 0000000000000..5f09dff909395 --- /dev/null +++ b/test/deprecated/legacy_test/test_attribute_var_deprecated.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest + +import numpy as np + +import paddle +import paddle.inference as paddle_infer +from paddle.base.framework import Program, program_guard + +paddle.enable_static() + + +class UnittestBase(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.init_info() + + def tearDown(self): + self.temp_dir.cleanup() + + def init_info(self): + self.shapes = None + self.save_path = None + + def path_prefix(self): + return type(self).__name__ + + def infer_prog(self): + config = paddle_infer.Config( + self.save_path + '.pdmodel', self.save_path + '.pdiparams' + ) + config.disable_mkldnn() + predictor = paddle_infer.create_predictor(config) + input_names = predictor.get_input_names() + for i, shape in enumerate(self.shapes): + input_handle = predictor.get_input_handle(input_names[i]) + self.fake_input = np.random.randn(*shape).astype("float32") + input_handle.reshape(shape) + input_handle.copy_from_cpu(self.fake_input) + predictor.run() + output_names = predictor.get_output_names() + res = [] + for out_name in output_names: + output_handle = predictor.get_output_handle(out_name) + output_data = output_handle.copy_to_cpu() + res.append(output_data) + + if len(output_names) == 1: + res = res[0] + + return res + + +class TestDropout(UnittestBase): + def init_info(self): + self.shapes = [[10, 10]] + self.save_path = os.path.join(self.temp_dir.name, 'dropout') + + def test_static(self): + main_prog = Program() + startup_prog = Program() + with program_guard(main_prog, startup_prog): + fc = paddle.nn.Linear(10, 10) + x = paddle.randn(self.shapes[0]) + x.stop_gradient = False + feat = fc(x) + # p is a Variable + p = paddle.randn([1]) + out = paddle.nn.functional.dropout(feat, p=p) + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + # test _to_string + self.assertTrue("Var[" in str(main_prog)) + + exe = paddle.static.Executor() + exe.run(startup_prog) + res = exe.run(fetch_list=[x, out]) + # export model + paddle.static.save_inference_model(self.save_path, [x], [out], exe) + + # Test for Inference Predictor + infer_out = self.infer_prog() + self.assertEqual(infer_out.shape, (10, 10)) + + self.assertEqual( + main_prog.block(0).ops[4].all_attrs()['dropout_prob'].name, + p.name, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/deprecated/legacy_test/test_eye_op.py b/test/deprecated/legacy_test/test_eye_op.py index 41a4e6aea2f9d..cafbfbd96beb0 100644 --- a/test/deprecated/legacy_test/test_eye_op.py +++ b/test/deprecated/legacy_test/test_eye_op.py @@ -17,7 +17,7 @@ import numpy as np from op_test import OpTest -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle import base diff --git a/test/deprecated/legacy_test/test_inference_model_io.py b/test/deprecated/legacy_test/test_inference_model_io.py index 2e179cf90276e..3b9d486e791e4 100644 --- a/test/deprecated/legacy_test/test_inference_model_io.py +++ b/test/deprecated/legacy_test/test_inference_model_io.py @@ -29,6 +29,7 @@ load_inference_model_distributed, save_persistables, ) +from paddle.pir_utils import test_with_pir_api from paddle.static.io import load_inference_model, save_inference_model paddle.enable_static() @@ -161,14 +162,15 @@ def test_fit_line_inference_model(self): class TestSaveInferenceModel(unittest.TestCase): + @test_with_pir_api def test_save_inference_model(self): root_path = tempfile.TemporaryDirectory() MODEL_DIR = os.path.join(root_path.name, 
"inference_model2") - init_program = Program() - program = Program() + init_program = paddle.static.Program() + program = paddle.static.Program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') @@ -188,14 +190,15 @@ def test_save_inference_model(self): ) root_path.cleanup() + @test_with_pir_api def test_save_inference_model_with_auc(self): root_path = tempfile.TemporaryDirectory() MODEL_DIR = os.path.join(root_path.name, "inference_model4") - init_program = Program() - program = Program() + init_program = paddle.static.Program() + program = paddle.static.Program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='int32') predict = paddle.static.nn.fc(x, size=2, activation='softmax') @@ -223,14 +226,15 @@ def test_save_inference_model_with_auc(self): class TestInstance(unittest.TestCase): + # @test_with_pir_api def test_save_inference_model(self): root_path = tempfile.TemporaryDirectory() MODEL_DIR = os.path.join(root_path.name, "inference_model3") - init_program = Program() - program = Program() + init_program = paddle.static.Program() + program = paddle.static.Program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') @@ -261,14 +265,15 @@ def test_save_inference_model(self): class TestSaveInferenceModelNew(unittest.TestCase): + # @test_with_pir_api def test_save_and_load_inference_model(self): root_path = tempfile.TemporaryDirectory() MODEL_DIR = os.path.join(root_path.name, "inference_model5") - init_program = base.default_startup_program() - program = base.default_main_program() + init_program = paddle.static.default_startup_program() + program = paddle.static.default_main_program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') @@ -283,7 +288,7 @@ def test_save_and_load_inference_model(self): sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() - exe = executor.Executor(place) + exe = base.Executor(place) exe.run(init_program, feed={}, fetch_list=[]) tensor_x = np.array([[1, 1], [1, 2], [5, 2]]).astype("float32") @@ -344,7 +349,12 @@ def test_save_and_load_inference_model(self): exe, ) - model_path = MODEL_DIR + "_isdir.pdmodel" + if paddle.framework.in_pir_mode(): + MODEL_SUFFIX = ".json" + else: + MODEL_SUFFIX = ".pdmodel" + + model_path = MODEL_DIR + "_isdir" + MODEL_SUFFIX os.makedirs(model_path) self.assertRaises( ValueError, @@ -356,7 +366,7 @@ def test_save_and_load_inference_model(self): ) os.rmdir(model_path) - params_path = MODEL_DIR + "_isdir.pdmodel" + params_path = MODEL_DIR + "_isdir" + MODEL_SUFFIX os.makedirs(params_path) self.assertRaises( ValueError, @@ -372,7 +382,7 @@ def test_save_and_load_inference_model(self): MODEL_DIR, [x, y], [avg_cost], exe ) - self.assertTrue(os.path.exists(MODEL_DIR + 
".pdmodel")) + self.assertTrue(os.path.exists(MODEL_DIR + MODEL_SUFFIX)) self.assertTrue(os.path.exists(MODEL_DIR + ".pdiparams")) expected = exe.run( @@ -405,7 +415,7 @@ def test_save_and_load_inference_model(self): unsupported_param=None, ) self.assertRaises( - (TypeError, ValueError), + (TypeError, RuntimeError, ValueError), paddle.static.load_inference_model, None, exe, @@ -435,7 +445,7 @@ def test_save_and_load_inference_model(self): self.assertRaises(ValueError, paddle.static.io.save_to_file, '', 123) # test _get_valid_program self.assertRaises(TypeError, paddle.static.io._get_valid_program, 0) - p = Program() + p = paddle.static.Program() cp = CompiledProgram(p) paddle.static.io._get_valid_program(cp) self.assertTrue(paddle.static.io._get_valid_program(cp) is p) @@ -491,12 +501,13 @@ def test_serialize_program_and_persistables(self): None, ) + @test_with_pir_api def test_normalize_program(self): - init_program = base.default_startup_program() - program = base.default_main_program() + init_program = paddle.static.default_startup_program() + program = paddle.static.default_main_program() # fake program without feed/fetch - with program_guard(program, init_program): + with paddle.static.program_guard(program, init_program): x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') @@ -525,7 +536,7 @@ def test_normalize_program(self): # test if return type of serialize_program is bytes res = paddle.static.normalize_program(program, [x, y], [avg_cost]) - self.assertTrue(isinstance(res, Program)) + self.assertTrue(isinstance(res, paddle.static.Program)) # test program type self.assertRaises( TypeError, paddle.static.normalize_program, None, [x, y], [avg_cost] @@ -545,6 +556,7 @@ def test_normalize_program(self): class TestLoadInferenceModelError(unittest.TestCase): + @test_with_pir_api def test_load_model_not_exist(self): place = core.CPUPlace() exe = executor.Executor(place) diff --git a/test/deprecated/legacy_test/test_inverse_op.py b/test/deprecated/legacy_test/test_inverse_op.py index 22810eecee07d..54f8466bd4d02 100644 --- a/test/deprecated/legacy_test/test_inverse_op.py +++ b/test/deprecated/legacy_test/test_inverse_op.py @@ -35,6 +35,12 @@ def setUp(self): np.random.seed(123) mat = np.random.random(self.matrix_shape).astype(self.dtype) + if self.dtype == 'complex64' or self.dtype == 'complex128': + mat = ( + np.random.random(self.matrix_shape) + + 1j * np.random.random(self.matrix_shape) + ).astype(self.dtype) + inverse = np.linalg.inv(mat) self.inputs = {'Input': mat} @@ -92,6 +98,26 @@ def config(self): self.python_api = paddle.tensor.math.inverse +class TestInverseOpComplex64(TestInverseOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "complex64" + self.python_api = paddle.tensor.math.inverse + + def test_grad(self): + self.check_grad(['Input'], 'Output', check_pir=True) + + +class TestInverseOpComplex128(TestInverseOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "complex128" + self.python_api = paddle.tensor.math.inverse + + def test_grad(self): + self.check_grad(['Input'], 'Output', check_pir=True) + + class TestInverseAPI(unittest.TestCase): def setUp(self): np.random.seed(123) diff --git a/test/deprecated/legacy_test/test_multinomial_op.py b/test/deprecated/legacy_test/test_multinomial_op.py index f6fc6e281193b..48c00ed5506e5 100644 --- a/test/deprecated/legacy_test/test_multinomial_op.py +++ b/test/deprecated/legacy_test/test_multinomial_op.py @@ -17,7 +17,7 
@@ import numpy as np from op_test import OpTest, convert_float_to_uint16 -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle import base diff --git a/test/deprecated/legacy_test/test_squared_l2_norm_op.py b/test/deprecated/legacy_test/test_squared_l2_norm_op.py deleted file mode 100755 index df36c81097051..0000000000000 --- a/test/deprecated/legacy_test/test_squared_l2_norm_op.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from numpy import linalg as LA -from op_test import OpTest - -import paddle -import paddle.distributed as dist -from paddle import _C_ops, _legacy_C_ops -from paddle.framework import in_dynamic_mode - - -def test_squared_l2_norm(x): - if in_dynamic_mode(): - return _C_ops.squared_l2_norm(x) - else: - return _legacy_C_ops.squared_l2_norm(x) - - -class TestSquaredL2NormF16Op(unittest.TestCase): - def init_test_case(self): - X = np.random.uniform(-0.1, 0.1, (8, 5, 10)).astype('float32') - return X - - def check_main(self, x_np, dtype): - paddle.disable_static() - x = paddle.to_tensor(x_np) - - x.stop_gradient = False - y = test_squared_l2_norm(x) - x_g = paddle.grad(y, [x]) - - paddle.enable_static() - return y, x_g - - def test_main(self): - x_np = self.init_test_case() - y_np_1, x_g_np_1 = self.check_main(x_np, 'float32') - y_np_2, x_g_np_2 = self.check_main(x_np, 'float16') - - def assert_equal(x, y): - np.testing.assert_allclose(x, y, rtol=1e-05, atol=0.0) - - assert_equal(y_np_1, y_np_2) - assert_equal(x_g_np_1, x_g_np_2) - - -class TestSquaredL2NormF16Op1(TestSquaredL2NormF16Op): - def init_test_case(self): - X = np.random.uniform(-2.0, 2.0, (30, 10)).astype('float32') - return X - - -class TestSquaredL2NormF16Op2(TestSquaredL2NormF16Op): - def init_test_case(self): - X = np.random.uniform(-5.0, 5.0, (20, 10, 20)).astype('float32') - return X - - -class TestL2LossOp(OpTest): - """Test squared_l2_norm""" - - def config(self): - self.x_shape = (13, 19) - self.check_auto_parallel = False - - def setUp(self): - self.config() - self.python_api = test_squared_l2_norm - self.op_type = "squared_l2_norm" - self.max_relative_error = 0.05 - - X = np.random.uniform(-1, 1, self.x_shape).astype("float32") - X[np.abs(X) < self.max_relative_error] = 0.1 - self.inputs = {'X': X} - self.outputs = {'Out': np.array([np.square(LA.norm(X))])} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad( - ['X'], - 'Out', - max_relative_error=self.max_relative_error, - check_auto_parallel=self.check_auto_parallel, - ) - - -class TestSquaredL2NormAutoParallel_1(TestL2LossOp): - def config(self): - self.x_shape = (14, 18) - self.check_auto_parallel = True - self.placements = { - 'X': [dist.Replicate()], - } - - -class TestSquaredL2NormAutoParallel_2(TestL2LossOp): - def config(self): - self.x_shape = (14, 18) - self.check_auto_parallel = 
True - self.placements = { - 'X': [dist.Shard(0)], - } - - -class TestSquaredL2NormAutoParallel_3(TestL2LossOp): - def config(self): - self.x_shape = (14, 18) - self.check_auto_parallel = True - self.placements = { - 'X': [dist.Shard(1)], - } - - -class TestL2LossDeterministic(unittest.TestCase): - def check_place(self, place): - with paddle.base.dygraph.guard(place): - x_np = np.random.rand(5, 11, 13).astype('float32') - x = paddle.to_tensor(x_np) - y1 = _legacy_C_ops.squared_l2_norm(x) - y2 = _legacy_C_ops.squared_l2_norm(x) - np.testing.assert_array_equal(y1.numpy(), y2.numpy()) - - def test_main(self): - self.check_place(paddle.CPUPlace()) - if paddle.is_compiled_with_cuda(): - self.check_place(paddle.CUDAPlace(0)) - - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() diff --git a/test/deprecated/legacy_test/test_squeeze2_op_rename.py b/test/deprecated/legacy_test/test_squeeze2_op_rename.py index ed347eda7350b..02e63c0cb2459 100644 --- a/test/deprecated/legacy_test/test_squeeze2_op_rename.py +++ b/test/deprecated/legacy_test/test_squeeze2_op_rename.py @@ -15,7 +15,7 @@ import os import unittest -from test_attribute_var import UnittestBase +from test_attribute_var_deprecated import UnittestBase import paddle from paddle.base.framework import Program, program_guard diff --git a/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py b/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py index b36a5121d2e82..5127589c36396 100644 --- a/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py +++ b/test/distributed_passes/test_fuse_allreduce_split_to_reducescatter_pass.py @@ -22,7 +22,7 @@ (%38) = "pd_op.data" () {dtype:(pd_op.DataType)bfloat16,name:"linear_0.tmp_0",persistable:[false],place:(pd_op.Place)Place(gpu:0),shape:(pd_op.IntArray)[4096,1,28672],stop_gradient:[false]} : () -> builtin.tensor<4096x1x28672xbf16> (%48) = "pd_op.data" () {dtype:(pd_op.DataType)bfloat16,name:"input",persistable:[false],place:(pd_op.Place)Place(gpu:0),shape:(pd_op.IntArray)[4096,1,28672],stop_gradient:[false]} : () -> builtin.tensor<4096x1x28672xbf16> (%50) = "pd_op.matmul" (%48, %2) {persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:true} : (builtin.tensor<4096x1x28672xbf16>, builtin.tensor<8192x28672xbf16>) -> builtin.tensor<4096x1x8192xbf16> - (%57) = "pd_op.c_allreduce_sum_" (%50) {persistable:[false],ring_id:(Int32)36,stop_gradient:[false],use_calc_stream:true,use_model_parallel:true} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16> + (%57) = "pd_op.c_allreduce_sum_" (%50) {event_to_record:"event_7989",events_to_wait:[],execution_stream:"auto_parallel_mp",force_record_event:false,persistable:[false],ring_id:(Int32)36,stop_gradient:[false],use_calc_stream:true,use_model_parallel:true} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16> (%63) = "pd_op.assign" (%57) {persistable:[false],stop_gradient:[false]} : (builtin.tensor<4096x1x8192xbf16>) -> builtin.tensor<4096x1x8192xbf16> (%64) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> builtin.tensor<1xi32> (%65) = "pd_op.split_with_num" (%63, %64) {num:(Int32)2,persistable:[false],stop_gradient:[false]} : (builtin.tensor<4096x1x8192xbf16>, builtin.tensor<1xi32>) -> vec[builtin.tensor<2048x1x8192xbf16>,builtin.tensor<2048x1x8192xbf16>] diff --git a/test/distribution/test_distribution_student_t.py 
b/test/distribution/test_distribution_student_t.py new file mode 100644 index 0000000000000..900e47cea2428 --- /dev/null +++ b/test/distribution/test_distribution_student_t.py @@ -0,0 +1,274 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import parameterize +import scipy.stats +from distribution import config +from parameterize import ( + TEST_CASE_NAME, + parameterize_cls, + parameterize_func, +) + +import paddle +from paddle.distribution.student_t import StudentT + + +@parameterize.place(config.DEVICES) +@parameterize.parameterize_cls( + (parameterize.TEST_CASE_NAME, 'df', 'loc', 'scale'), + [ + ( + 'one-dim', + 10.0, + 1.0, + 2.0, + ), + ( + 'multi-dim', + parameterize.xrand((2, 1), dtype='float32', min=4, max=30), + parameterize.xrand((2, 3), dtype='float32', min=1, max=10), + parameterize.xrand((2, 3), dtype='float32', min=0.1, max=3), + ), + ( + 'multi-dim2', + parameterize.xrand((2, 1), dtype='float64', min=4, max=30), + parameterize.xrand((2, 3), dtype='float64', min=-10, max=-1), + parameterize.xrand((2, 3), dtype='float64', min=0.1, max=3), + ), + ], +) +class TestStudentT(unittest.TestCase): + def setUp(self): + df = ( + self.df if isinstance(self.df, float) else paddle.to_tensor(self.df) + ) + loc = ( + self.loc + if isinstance(self.loc, float) + else paddle.to_tensor(self.loc) + ) + scale = ( + self.scale + if isinstance(self.scale, float) + else paddle.to_tensor(self.scale) + ) + self._dist = StudentT(df, loc, scale) + + def test_mean(self): + mean = self._dist.mean + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + self.assertEqual(mean.numpy().dtype, target_dtype) + np.testing.assert_allclose( + mean, + self._np_mean(), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + def test_variance(self): + var = self._dist.variance + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + self.assertEqual(var.numpy().dtype, target_dtype) + np.testing.assert_allclose( + var, + self._np_variance(), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + def test_entropy(self): + entropy = self._dist.entropy() + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + self.assertEqual(entropy.numpy().dtype, target_dtype) + np.testing.assert_allclose( + entropy, + self._np_entropy(), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + def test_sample(self): + sample_shape = () + samples = self._dist.sample(sample_shape) + self.assertEqual( + tuple(samples.shape), + sample_shape + self._dist.batch_shape + self._dist.event_shape, + ) + + sample_shape = (10000,) + samples = self._dist.sample(sample_shape) + sample_mean = samples.mean(axis=0) + sample_variance = samples.var(axis=0) + + # Tolerance value 0.1 is an empirical value which is consistent with + # TensorFlow + np.testing.assert_allclose( + sample_mean, self._dist.mean, atol=0, rtol=0.10 + ) + # Tolerance value 0.1 is an empirical value which is consistent with + # TensorFlow + np.testing.assert_allclose( + sample_variance, self._dist.variance, atol=0, rtol=0.10 + ) + + def _np_variance(self): + if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32: + df = self.df.astype("float64") + else: + df = self.df + if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32: + loc = self.loc.astype("float64") + else: + loc = self.loc + if ( + isinstance(self.scale, np.ndarray) + and self.scale.dtype == np.float32 + ): + scale = self.scale.astype("float64") + else: + scale = self.scale + return scipy.stats.t.var(df, loc, scale) + + def _np_mean(self): + if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32: + df = self.df.astype("float64") + else: + df = self.df + if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32: + loc = self.loc.astype("float64") + else: + loc = self.loc + if ( + isinstance(self.scale, np.ndarray) + and self.scale.dtype == np.float32 + ): + scale = self.scale.astype("float64") + else: + scale = self.scale + return scipy.stats.t.mean(df, loc, scale) + + def _np_entropy(self): + if isinstance(self.df, np.ndarray) and self.df.dtype == np.float32: + df = self.df.astype("float64") + else: + df = self.df + if isinstance(self.loc, np.ndarray) and self.loc.dtype == np.float32: + loc = self.loc.astype("float64") + else: + loc = self.loc + if ( + isinstance(self.scale, np.ndarray) + and self.scale.dtype == np.float32 + ): + scale = self.scale.astype("float64") + else: + scale = self.scale + return scipy.stats.t.entropy(df, loc, scale) + + +@parameterize.place(config.DEVICES) +@parameterize.parameterize_cls( + (parameterize.TEST_CASE_NAME, 'df', 'loc', 'scale', 'value'), + [ + ( + 'one-dim', + 10.0, + 0.0, + 1.0, + np.array(3.3).astype("float32"), + ), + ( + 'value-broadcast-shape', + parameterize.xrand((2, 1), dtype='float64', min=4, max=30), + parameterize.xrand((2, 1), dtype='float64', min=-10, max=10), + parameterize.xrand((2, 1), dtype='float64', min=0.1, max=5), + parameterize.xrand((2, 4), dtype='float64', min=-10, max=10), + ), + ], +) +class TestStudentTProbs(unittest.TestCase): + def setUp(self): + df = ( + self.df if isinstance(self.df, float) else paddle.to_tensor(self.df) + ) + loc = ( + self.loc + if isinstance(self.loc, float) + else paddle.to_tensor(self.loc) + ) + scale = ( + self.scale + if isinstance(self.scale, float) + else paddle.to_tensor(self.scale) + ) + self._dist = StudentT(df, loc, scale) + + def test_prob(self): + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + np.testing.assert_allclose( + self._dist.prob(paddle.to_tensor(self.value)), + scipy.stats.t.pdf(self.value, self.df, self.loc, self.scale), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + def test_log_prob(self): + target_dtype = ( + "float32" if isinstance(self.df, float) else self.df.dtype + ) + np.testing.assert_allclose( + self._dist.log_prob(paddle.to_tensor(self.value)), + scipy.stats.t.logpdf(self.value, self.df, self.loc, self.scale), + rtol=config.RTOL.get(str(target_dtype)), + atol=config.ATOL.get(str(target_dtype)), + ) + + +@parameterize.place(config.DEVICES) +@parameterize_cls([TEST_CASE_NAME], ['StudentTTestError']) +class StudentTTestError(unittest.TestCase): + def setUp(self): + paddle.disable_static(self.place) + + @parameterize_func( + [ + (-5.0, 0.0, 1.0, 
ValueError), # negative df + (5.0, 0.0, -1.0, ValueError), # negative scale + ] + ) + def test_bad_parameter(self, df, loc, scale, error): + with paddle.base.dygraph.guard(self.place): + self.assertRaises(error, StudentT, df, loc, scale) + + @parameterize_func([(10,)]) # not sequence object sample shape + def test_bad_sample_shape(self, shape): + with paddle.base.dygraph.guard(self.place): + t = StudentT(5.0, 0.0, 1.0) + self.assertRaises(TypeError, t.sample, shape) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/dygraph_to_static/test_typehint.py b/test/dygraph_to_static/test_typehint.py index fd4dbacc6ad6d..b84ce4f332a91 100644 --- a/test/dygraph_to_static/test_typehint.py +++ b/test/dygraph_to_static/test_typehint.py @@ -35,15 +35,15 @@ def function(x: A) -> A: def fn_annotation_assign_with_value(x: paddle.Tensor): if x: - y: List["paddle.Tensor"] = [x + 1] + y: List[paddle.Tensor] = [x + 1] else: - y: List["paddle.Tensor"] = [x - 1] + y: List[paddle.Tensor] = [x - 1] return y def fn_annotation_assign_without_value(x: paddle.Tensor): if x: - y: List["paddle.Tensor"] + y: List[paddle.Tensor] y = [x + 1] else: y = [x - 1] diff --git a/test/ir/inference/quant_dequant_test.py b/test/ir/inference/quant_dequant_test.py index c176e802a525c..85724a2cc7df2 100644 --- a/test/ir/inference/quant_dequant_test.py +++ b/test/ir/inference/quant_dequant_test.py @@ -22,9 +22,10 @@ import paddle from paddle import base -from paddle.base import Program, Variable, core +from paddle.base import core from paddle.base.core import AnalysisConfig, create_paddle_predictor from paddle.base.framework import IrGraph +from paddle.static import Variable from paddle.static.io import append_fetch_ops, prepend_feed_ops from paddle.static.quantization import ( AddQuantDequantPass, @@ -39,10 +40,10 @@ class QuantDequantTest(unittest.TestCase): def __init__(self, methodName='runTest'): super().__init__(methodName) paddle.enable_static() - self.main_program = base.Program() - self.startup_program = base.Program() - self.test_main_program = base.Program() - self.test_startup_program = base.Program() + self.main_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + self.test_main_program = paddle.static.Program() + self.test_startup_program = paddle.static.Program() self.feeds = None self.fetch_list = None self.enable_mkldnn = False @@ -62,10 +63,9 @@ def __init__(self, methodName='runTest'): # from Paddle release2.1 def _normalize_program(self, program, feed_vars, fetch_vars): - if not isinstance(program, Program): + if not isinstance(program, paddle.static.Program): raise TypeError( - "program type must be `base.Program`, but received `%s`" - % type(program) + f"program type must be `paddle.static.Program`, but received `{type(program)}`" ) if not isinstance(feed_vars, list): feed_vars = [feed_vars] @@ -127,7 +127,7 @@ def _save_models( if var.name in feeded_var_names: feeded_vars.append(var) - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): paddle.static.io.save_inference_model( dirname, feeded_vars, @@ -155,7 +155,7 @@ def _get_paddle_outs(self, feed, fetch_list, executor, program, scope): ''' Return PaddlePaddle outputs. ''' - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): outs = executor.run( program=program, feed=feed, @@ -245,12 +245,12 @@ def check_output_with_option( or disable TensorRT, enable MKLDNN or disable MKLDNN are all the same. 
''' - place = base.CUDAPlace(0) if use_gpu else base.CPUPlace() - executor = base.Executor(place) - scope = base.Scope() + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + executor = paddle.static.Executor(place) + scope = paddle.static.Scope() device = "GPU" if use_gpu else "CPU" - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): executor.run(self.startup_program) executor.run(self.test_startup_program) main_graph = IrGraph(core.Graph(self.main_program.desc), for_test=False) @@ -274,11 +274,11 @@ def check_output_with_option( scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place) scale_training_pass.apply(main_graph) - build_strategy = base.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() build_strategy.memory_optimize = False build_strategy.enable_inplace = False build_strategy.fuse_all_reduce_ops = False - binary = base.CompiledProgram(main_graph.graph) + binary = paddle.static.CompiledProgram(main_graph.graph) iters = 10 batch_size = 1 @@ -287,7 +287,7 @@ def check_output_with_option( batch_size=batch_size, ) feeder = base.DataFeeder(feed_list=[self.data, self.label], place=place) - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): for _ in range(iters): data = next(train_reader()) loss_v = executor.run( @@ -307,7 +307,7 @@ def check_output_with_option( self.main_program = test_graph.to_program() - with base.scope_guard(scope): + with paddle.static.scope_guard(scope): self.main_program = self._normalize_program( self.main_program, self.data, self.fetch_list ) @@ -450,6 +450,6 @@ def __init__( self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16 def quant_dequant(self): - place = base.CPUPlace() - exe = base.Executor(place) - scope = base.Scope() + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() diff --git a/test/ir/pir/cinn/sub_graphs/base.py b/test/ir/pir/cinn/sub_graphs/base.py index a11ffe4f9e1bd..a0ceee03095db 100644 --- a/test/ir/pir/cinn/sub_graphs/base.py +++ b/test/ir/pir/cinn/sub_graphs/base.py @@ -30,7 +30,7 @@ def setUp(self): self.atol = 1e-6 self.train_atol = 1e-6 self.with_precision_compare = True - self.with_train = False # defaults to False in this PR; the next incremental PR switches the default to True + self.with_train = True # defaults to False in this PR; the next incremental PR switches the default to True # override customized settting self.init() if self.inputs: diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py index e5d86d0e40f53..228465812c587 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_0.py @@ -135,7 +135,6 @@ def init(self): paddle.rand(shape=[22, 512, 7, 7], dtype=paddle.float32), ) self.net = LayerCase - self.with_train = True def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py index 10ed97211646c..d40e635bca9ed 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_1.py @@ -62,6 +62,7 @@ def init(self): paddle.rand(shape=[10, 512, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py index c151d478a6ac6..b871017d1e038 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py +++ 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_10.py @@ -75,6 +75,7 @@ def init(self): paddle.rand(shape=[10, 36, 28, 28], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py index 464ab6166a0fa..83fd4bff996bc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_11.py @@ -65,6 +65,7 @@ def init(self): paddle.rand(shape=[10, 1280, 1, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py index 24d79ccfc8e94..dd91f88558b59 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_13.py @@ -60,6 +60,7 @@ def init(self): paddle.rand(shape=[10, 2048, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py index 167b10dd6df2f..7708b6fb6c2bb 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_14.py @@ -72,6 +72,7 @@ def init(self): paddle.rand(shape=[22, 128, 56, 56], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py index c5050e5cb9d55..4d1ac693615d3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_15.py @@ -72,6 +72,7 @@ def init(self): paddle.rand(shape=[10, 122, 28, 28], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py index 5fad58c5de16b..3e6696a5f23c9 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_16.py @@ -115,6 +115,7 @@ def init(self): paddle.rand(shape=[22, 28, 56, 56], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py index 5dc0d861cc847..62ef8a2dbe38c 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_17.py @@ -60,6 +60,7 @@ def init(self): paddle.rand(shape=[22, 2048, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py index b4010043304be..e8f4772b757a5 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_18.py @@ -68,6 +68,7 @@ def init(self): paddle.rand(shape=[22, 1536, 8, 8], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False # NOTE output mismatch with prim diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py index d3faccc973b03..883067279e417 100644 --- 
a/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_2.py @@ -74,7 +74,6 @@ def init(self): paddle.rand(shape=[43, 256, 56, 56], dtype=paddle.float32), ) self.net = LayerCase - self.with_train = True def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py index 57dcec3e56353..82523d9dd29e4 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_20.py @@ -77,6 +77,7 @@ def init(self): paddle.rand(shape=[86, 192], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py index 49eea1bd4cbfd..b19151557a65a 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_21.py @@ -108,6 +108,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[86, 198, 192], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False # NOTE output mismatch with prim diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py index 83ddc2b51b2b8..b37c912b61f5d 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py @@ -60,6 +60,7 @@ def init(self): paddle.rand(shape=[11, 24, 56, 56], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py index b434f440365f6..d6be0ea181c59 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_25.py @@ -68,6 +68,7 @@ def init(self): paddle.rand(shape=[11, 1280, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py index 6a25c112a0b47..5387f9ee37177 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_28.py @@ -68,6 +68,7 @@ def init(self): paddle.rand(shape=[10, 320, 8, 8], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py index 85b2207fd1ee1..9283f453e46ae 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_29.py @@ -68,6 +68,7 @@ def init(self): paddle.rand(shape=[10, 2048, 10, 10], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py index 23b9ec755c7be..9c538dea0d694 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_3.py @@ -89,6 +89,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[16, 49], dtype=paddle.int64), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py index 81d18df09b741..eee47cf931cd9 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_31.py @@ -66,6 +66,7 @@ def init(self): paddle.rand(shape=[22, 288, 14, 14], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py index 7586bd7c8cd37..2bed2bfc9a742 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_32.py @@ -54,6 +54,7 @@ def init(self): paddle.rand(shape=[22, 1024, 1, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py index 0d50f420cdc22..55b168f5e2ade 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_33.py @@ -84,6 +84,7 @@ def init(self): paddle.rand(shape=[10, 256, 14, 14], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-5 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py index 7466135585abd..a8d09423a95eb 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_34.py @@ -57,6 +57,7 @@ def init(self): paddle.rand(shape=[10, 32, 56, 56], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py index 7eb05d010bd2f..8c70aa1f75ae2 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_35.py @@ -84,6 +84,7 @@ def init(self): paddle.rand(shape=[4, 3, 384, 384], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py index 03f141b241bdc..6abd8655d98f6 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_36.py @@ -70,6 +70,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[6, 9216, 96], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py index 431650d6bdbef..828f15fa32c3b 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_38.py @@ -48,6 +48,7 @@ def init(self): paddle.rand(shape=[4, 48, 96, 96], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py index ddd3cdf8c3eda..44431cb437d82 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_39.py @@ -46,6 +46,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[12, 288, 192], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py index 9d419dbb38959..f03c8322cce70 
100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_4.py @@ -51,6 +51,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[22, 196, 128], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py index 352f81b791d41..d3d09e75e4f70 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_41.py @@ -66,6 +66,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py index 0e8a6574081a4..60d3846377987 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_42.py @@ -114,6 +114,7 @@ def init(self): paddle.rand(shape=[2, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py index 0104a18d75d60..9440b6cb9dbd5 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_43.py @@ -258,6 +258,7 @@ def init(self): paddle.rand(shape=[1, 2048, 24, 36], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-5 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py index 06c021953fd1e..34416aea9ae97 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_44.py @@ -143,6 +143,7 @@ def init(self): paddle.rand(shape=[1, 100, 256], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 self.with_cinn = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py index 8c9802242f436..d2f6befdc9147 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_45.py @@ -70,6 +70,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[1, 4], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py index 6e45b88c332da..19ec352bcf5d4 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_46.py @@ -62,6 +62,7 @@ def init(self): paddle.rand(shape=[1, 80, 50, 50], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py index 72599e85f742f..5096d5f366b63 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_47.py @@ -47,6 +47,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int64), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py index 
eaa9d3e6b9232..7fc4b64f1466f 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_48.py @@ -190,6 +190,7 @@ def init(self): paddle.rand(shape=[1, 625, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-5 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py index 34ecd19552529..4367e45015b23 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_49.py @@ -66,6 +66,7 @@ def init(self): paddle.rand(shape=[1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py index 7c9639d906cda..181d06fffb4c3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_5.py @@ -46,6 +46,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[22, 16, 384], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py index 10ab5da982012..152dc5b2ce483 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_51.py @@ -90,6 +90,7 @@ def init(self): paddle.rand(shape=[1, 4, 64, 64], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py index ed08605e070d1..e1a3774b1be35 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_52.py @@ -94,6 +94,7 @@ def init(self): paddle.rand(shape=[91], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py index cf04f914d15a9..7bdef30c7d243 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_54.py @@ -117,6 +117,7 @@ def init(self): paddle.rand(shape=[1, 96, 128, 128], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py index 7d065da0bc99b..9a623a7afa130 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_55.py @@ -78,6 +78,7 @@ def init(self): paddle.rand(shape=[1, 192, 32, 32], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py index 79d9a9c15cf9e..4646923191e60 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_56.py @@ -74,6 +74,7 @@ def init(self): paddle.rand(shape=[24], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py index a34e30dc687e2..d297a19fa0932 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py +++ 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_57.py @@ -42,6 +42,7 @@ def init(self): self.input_specs = [] self.inputs = () self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py index 12dc85dbf3d3f..072c8077b7295 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_59.py @@ -95,6 +95,7 @@ def init(self): paddle.rand(shape=[1, 44, 32, 32], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py index f51b3a846151d..89a1c19ed53a7 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_6.py @@ -47,6 +47,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[10, 196, 640], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py index 21332c862ab22..41be02a221bd4 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_61.py @@ -91,6 +91,7 @@ def init(self): paddle.rand(shape=[1, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-5 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py index d4a2234509d1c..dd6069d9f9555 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_62.py @@ -71,6 +71,7 @@ def init(self): paddle.rand(shape=[1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py index 5456431c96fea..6a6f430bd82be 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_63.py @@ -96,6 +96,7 @@ def init(self): paddle.rand(shape=[171888, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py index 9ec76729c00e0..820f7af48178e 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_64.py @@ -72,6 +72,7 @@ def init(self): paddle.rand(shape=[512, 256, 7, 7], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py index 18af525df5c4c..e7e636628d5f1 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_65.py @@ -55,6 +55,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[2, 2002], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py index 1c3d72c455056..033202891b2ed 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_66.py 
@@ -64,6 +64,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[2, 1788], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py index 75fb8ca7cfb38..74513aac91b5b 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_67.py @@ -134,6 +134,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[1], dtype=paddle.int32), ) self.net = LayerCase + self.with_train = False self.with_cinn = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py index d3571d898798f..67df4b8fba497 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_68.py @@ -206,6 +206,7 @@ def init(self): paddle.rand(shape=[528, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py index c1c4b94929310..4e64e3aea0bbc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_69.py @@ -65,6 +65,7 @@ def init(self): paddle.rand(shape=[1, 171888, 4], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py index f4236d7664c59..bdc2d7b052c77 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_7.py @@ -91,6 +91,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[49, 49], dtype=paddle.int64), ) self.net = LayerCase + self.with_train = False self.with_cinn = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py index 30b04988e601f..a483c47e1e05f 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py @@ -61,6 +61,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[2], dtype=paddle.int32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py index ff048a21337da..489eab05cf04e 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_72.py @@ -143,6 +143,7 @@ def init(self): self.input_specs = [] self.inputs = () self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py index ea4a9cd49726d..a75d51a21cd1e 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_73.py @@ -98,6 +98,7 @@ def init(self): paddle.rand(shape=[2], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_cinn = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py index a069b9bc3874b..03fcab9ff9f00 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_74.py @@ -75,6 +75,7 @@ def 
init(self): paddle.rand(shape=[2], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.with_precision_compare = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py index 41204b7c15d2e..a20fbaf33e4e7 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_75.py @@ -96,6 +96,7 @@ def init(self): paddle.rand(shape=[1, 3, 544, 736], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py index bb22fb38c693a..4ad52c6aa976c 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_77.py @@ -209,6 +209,7 @@ def init(self): paddle.rand(shape=[1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py index af4320f4609ef..f987f5a334ca6 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_78.py @@ -125,6 +125,7 @@ def init(self): paddle.rand(shape=[1, 256, 13, 19], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py index 96d9de9b9c2b6..1bf2af665a2e2 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py @@ -134,6 +134,7 @@ def init(self): paddle.rand(shape=[1, 3, 96, 96, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py index 6340bf5a4d451..656e522137b4b 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_8.py @@ -47,6 +47,7 @@ def init(self): paddle.rand(shape=[22, 128, 14, 14], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py index 2fe8b3f007e86..4a34d06b5b4af 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_80.py @@ -125,6 +125,7 @@ def init(self): paddle.rand(shape=[1, 3, 48, 48, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py index dc0d1e5126259..acbe1eae0ae60 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_81.py @@ -80,6 +80,7 @@ def init(self): paddle.rand(shape=[1, 80, 44, 44], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py index 65ab9b68b7b6d..9761629a802e3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_82.py @@ -173,6 +173,7 @@ def init(self): paddle.rand(shape=[2541, 2], dtype=paddle.float32), ) self.net = 
LayerCase + self.with_train = False self.with_cinn = False # NOTE cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py index 2a1a527317b91..889e5b0e9dfde 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_83.py @@ -96,6 +96,7 @@ def init(self): self.input_specs = [] self.inputs = () self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py index 595163ad073e1..a20bac9133a8f 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_84.py @@ -81,6 +81,7 @@ def init(self): paddle.rand(shape=[1, 2541, 68], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py index 9ef4bf92bc473..80137072f1c23 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_85.py @@ -61,6 +61,7 @@ def init(self): paddle.rand(shape=[16384, 5], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py index 698760309d8ff..47221f58d3ca3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_86.py @@ -247,6 +247,7 @@ def init(self): paddle.rand(shape=[1, 2048, 1, 1], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py index b44fdc4c28783..4e23ab81535de 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_87.py @@ -201,6 +201,7 @@ def init(self): paddle.rand(shape=[1, 144, 21, 32], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py index 425537e634f25..0ed66f4e89e8d 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py @@ -79,6 +79,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[1, 500], dtype=paddle.int32), ) self.net = LayerCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py index ab1503ef63afa..21faaf7dcad30 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_89.py @@ -91,6 +91,7 @@ def init(self): paddle.rand(shape=[1, 256, 28, 40], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False # if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py index e8919aec6e379..7dd68051a5efa 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_9.py @@ -90,6 +90,7 @@ def init(self): paddle.randint(low=0, high=10, shape=[49, 196], dtype=paddle.int64), ) self.net = LayerCase + self.with_train = False self.with_cinn = False # NOTE prim 
+ cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py index e3f28f9775a69..85f937d265d5b 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_90.py @@ -65,6 +65,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[12], dtype=paddle.float32),) self.net = LayerCase + self.with_train = False def set_flags(self): # NOTE(Aurelius84): cinn_op.pool2d only support pool_type='avg' under adaptive=True diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py index d4d06895c49ae..1a166fad740a7 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_adaptive_avg_pool2d.py @@ -48,6 +48,7 @@ def init(self): paddle.rand(shape=[22, 480, 7, 7], dtype=paddle.float32), ) self.net = AdaptiveAvgPool2dCase + self.with_train = False # NOTE prim + cinn lead to error diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py index c9cf656ad4a0c..9434d1c189373 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add.py @@ -54,6 +54,7 @@ def init(self): paddle.rand(shape=[22, 196, 128], dtype=paddle.float32), ) self.net = AddCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py index c488de14d12be..18cf5c72f2a50 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_add_n.py @@ -104,6 +104,7 @@ def init(self): paddle.rand(shape=[1], dtype=paddle.float32), ) self.net = AddNCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py index 0a40ca5079931..957102539eb07 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_avg_pool2d.py @@ -56,6 +56,7 @@ def init(self): paddle.rand(shape=[22, 128, 56, 56], dtype=paddle.float32), ) self.net = AvgPool2dCase + self.with_train = False self.atol = 1e-8 self.with_cinn = False diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py index 36dae471d0d7d..35e12f767dae7 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_chunk.py @@ -46,6 +46,7 @@ def init(self): paddle.rand(shape=[10, 2304, 192], dtype=paddle.float32), ) self.net = ChunkCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py index f65682e4b0ae9..b298c0870d4bc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_concat.py @@ -54,6 +54,7 @@ def init(self): paddle.rand(shape=[145, 12, 112, 112], dtype=paddle.float32), ) self.net = ConcatCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py index c189750c9f040..5bdd5b1622a34 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py +++ 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_conv_nd.py @@ -63,6 +63,7 @@ def init(self): paddle.rand(shape=[22, 64, 56, 56], dtype=paddle.float32), ) self.net = ConvNdCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py index 381eb461b6328..c4a358ad4b0bf 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_linear.py @@ -54,6 +54,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[10, 64], dtype=paddle.float32),) self.net = LinearCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py index 5cd643fc5ef4a..96d2bd54868d1 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_max_pool2d.py @@ -55,6 +55,7 @@ def init(self): paddle.rand(shape=[22, 64, 112, 112], dtype=paddle.float32), ) self.net = MaxPool2dCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py index 1e56b482d3736..fa389063a0513 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py @@ -54,6 +54,7 @@ def init(self): paddle.rand(shape=[22, 1500, 14, 14], dtype=paddle.float32), ) self.net = LayerCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py index f628bc19cc9aa..f267c1610f665 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_relu6.py @@ -47,6 +47,7 @@ def init(self): paddle.rand(shape=[22, 144, 56, 56], dtype=paddle.float32), ) self.net = Relu6Case + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py index 5abaff9157d1d..540958310b7cc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_reshape.py @@ -44,6 +44,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[4312, 640], dtype=paddle.float32),) self.net = ReshapeCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py index 3f77a5c68a93a..a746f3cdd41bc 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_sigmoid.py @@ -46,6 +46,7 @@ def init(self): paddle.rand(shape=[10, 512, 1, 1], dtype=paddle.float32), ) self.net = SigmoidCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py index b82ec109ca724..57de6d8cb09c0 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_split.py @@ -48,6 +48,7 @@ def init(self): paddle.rand(shape=[11, 976, 7, 7], dtype=paddle.float32), ) self.net = SplitCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py 
b/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py index 516d6c6735ff6..4f7438c8a00eb 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_squeeze_unsqueeze.py @@ -51,6 +51,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[1, 12, 1, 64], dtype=paddle.float32),) self.net = SqueezeCase + self.with_train = False self.atol = 1e-8 @@ -66,6 +67,7 @@ def init(self): ] self.inputs = (paddle.rand(shape=[1, 12, 1, 64], dtype=paddle.float32),) self.net = UnsqueezeCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py index 1f7402d0470ed..da572f47bfd94 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_swish.py @@ -46,6 +46,7 @@ def init(self): paddle.rand(shape=[43, 32, 112, 112], dtype=paddle.float32), ) self.net = SwishCase + self.with_train = False self.atol = 1e-8 diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py index 49a05607e3ae3..51db880532187 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_transpose.py @@ -46,6 +46,7 @@ def init(self): paddle.rand(shape=[22, 4, 224, 224], dtype=paddle.float32), ) self.net = TransposeCase + self.with_train = False if __name__ == '__main__': diff --git a/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py b/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py new file mode 100644 index 0000000000000..a3e9b838eeae4 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_dyshape_group_norm.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
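The new test file that follows drives group_norm through CINN with symbolic dimensions: every dim except the channel is left as None in the InputSpec, so one compiled program must serve any batch and spatial size. A minimal sketch of that dynamic-shape mechanism, using only the public paddle.jit.to_static/InputSpec API (the suite's utils.apply_to_static helper presumably layers a CINN build strategy on top of this):

import paddle
from paddle.static import InputSpec


class Scale(paddle.nn.Layer):
    def forward(self, x):
        return x * 2.0


# None dims stay symbolic, so a single compiled program covers many shapes;
# only the trailing (channel) dim is pinned, matching the test below.
spec = [InputSpec(shape=[None, None, None, 128], dtype='float32')]
static_net = paddle.jit.to_static(Scale(), input_spec=spec)
out_a = static_net(paddle.randn([1, 8, 8, 128]))
out_b = static_net(paddle.randn([2, 16, 4, 128]))  # same program, new shape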
+ +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class GroupNorm(nn.Layer): + def __init__(self): + super().__init__() + self.hidden_size = 768 + self.dtype = "float32" + self.weight = paddle.randn([128], dtype=self.dtype) + self.weight.stop_gradient = False + self.bias = paddle.randn([128], dtype=self.dtype) + self.bias.stop_gradient = False + + self.data_format = "NHWC" + + def forward(self, x): + return paddle.nn.functional.group_norm( + x, + num_groups=32, + epsilon=1e-6, + weight=self.weight, + bias=self.bias, + data_format=self.data_format, + ) + + +class TestGroupNorm(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.shape = [1, 128, 256, 128] + self.dtype = "float32" + self.data_format = "NHWC" + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randn(self.shape, dtype=self.dtype) + self.x.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 2) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 2}) + + def eval(self, use_cinn): + paddle.seed(2024) + net = GroupNorm() + input_spec = [ + InputSpec(shape=[None, None, None, 128], dtype='float32'), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + cinn_out = self.eval(use_cinn=True) + dy_out = self.eval(use_cinn=False) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py b/test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py new file mode 100644 index 0000000000000..6443a60c331f9 --- /dev/null +++ b/test/ir/pir/fused_pass/onednn/test_placement_pass_mean_op.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
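The PassTest that follows asserts, via its valid_op_map, that onednn_placement_pass rewrites the CPU mean op into its oneDNN variant. A minimal sketch of that op-count contract (the counting helper is illustrative, not the harness's real code; the actual checks live in test/ir/pir/fused_pass/pass_test.py):

from collections import Counter


def check_valid_op_map(op_names, valid_op_map):
    # valid_op_map maps op name -> exact count expected after the passes run;
    # a count of 0 means the op must have been replaced or fused away.
    counts = Counter(op_names)
    for op, expected in valid_op_map.items():
        assert counts[op] == expected, f"{op}: got {counts[op]}, want {expected}"


# After onednn_placement_pass, mean should appear as onednn_op.mean:
check_valid_op_map(
    ["pd_op.data", "onednn_op.mean", "pd_op.assign", "pd_op.fetch"],
    {"onednn_op.mean": 1, "pd_op.mean": 0},
)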
+import unittest + +import numpy as np +from pass_test import PassTest + +import paddle + +paddle.enable_static() + + +class TestMeanPlacementPass(PassTest): + def is_program_valid(self, program=None): + return True + + def build_ir_program(self): + with paddle.pir_utils.IrGuard(): + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.pir.core.program_guard(main_prog, start_prog): + x = paddle.static.data( + name='x', shape=[5, 2, 5, 5], dtype='float32' + ) + mean = paddle.mean(x) + out = paddle.assign(mean) + self.pass_attr_list = [{'onednn_placement_pass': {}}] + + self.feeds = { + "x": np.random.random((5, 2, 5, 5)).astype("float32"), + } + self.fetch_list = [out] + self.valid_op_map = { + "onednn_op.mean": 1, + } + return [main_prog, start_prog] + + def sample_program(self): + yield self.build_ir_program(), False + + def setUp(self): + self.places.append(paddle.CPUPlace()) + + def test_check_output(self): + self.check_pass_correct() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/ir/pir/fused_pass/pass_test.py b/test/ir/pir/fused_pass/pass_test.py index 3bb937ec59771..c5066bad6b34f 100644 --- a/test/ir/pir/fused_pass/pass_test.py +++ b/test/ir/pir/fused_pass/pass_test.py @@ -69,7 +69,7 @@ def run_program(self, executor, startup_program, main_program): fetches = executor.run( main_program, feed=self.feeds, - fetch_list=self.fetch_list, + fetch_list=main_program.list_vars()[-1], ) return fetches diff --git a/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py b/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py index fac6e62bc2278..2af09ed475b33 100644 --- a/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py +++ b/test/ir/pir/fused_pass/test_add_norm_fuse_pass.py @@ -21,8 +21,6 @@ from paddle.base import core from paddle.pir.core import create_parameter -paddle.enable_static() - class TestRmsNormFusePattern(PassTest): r""" @@ -284,7 +282,7 @@ class TestAddLayerNormFusePattern(TestRmsNormFusePattern): def sample_program(self): for x_shape in [[1, 1, 4096]]: for w_shape in [[4096]]: - for w_type in ['float32']: + for x_type in ['float32', 'float16']: for epilson in [1e-6]: with paddle.pir_utils.IrGuard(): start_prog = paddle.static.Program() @@ -295,10 +293,10 @@ def sample_program(self): residual = paddle.static.data( name='residual', shape=x_shape, - dtype='float32', + dtype=x_type, ) x = paddle.static.data( - name='x', shape=x_shape, dtype='float32' + name='x', shape=x_shape, dtype=x_type ) w_attr = paddle.ParamAttr( learning_rate=0.0, @@ -306,13 +304,19 @@ def sample_program(self): mean=0.0, std=2.0 ), ) + b_attr = paddle.ParamAttr( + learning_rate=0.0, + initializer=paddle.nn.initializer.Normal( + mean=0.0, std=2.0 + ), + ) w1 = create_parameter( name="w1", shape=w_shape, - dtype=w_type, + dtype=x_type, initializer=paddle.nn.initializer.Assign( np.random.random([4096, 4096]).astype( - w_type + x_type ) ), ) @@ -322,6 +326,7 @@ def sample_program(self): add_out.shape[-1:], epsilon=epilson, weight_attr=w_attr, + bias_attr=b_attr, ) layer_norm_out = layer_norm(add_out) matmul_out = paddle.matmul(layer_norm_out, w1) @@ -332,11 +337,11 @@ def sample_program(self): ] self.feeds = { "x": np.random.random(x_shape).astype( - "float32" + x_type ), "residual": np.random.random( x_shape - ).astype("float32"), + ).astype(x_type), } self.fetch_list = [out] self.valid_op_map = { @@ -350,5 +355,290 @@ def test_check_output(self): self.check_pass_correct(atol=1e-3, rtol=1e-3) +class TestAddGroupNormPattern_FP16(PassTest): + r""" + x residual + | | + 
add + | + group_norm + """ + + def is_program_valid(self, program=None): + return True + + def sample_program(self): + for x_shape in [[2, 6, 4, 2]]: + for residual_shape in [[1, 6, 1, 1]]: + for dtype in ['float16']: + for epilson in [1e-5]: + for groups in [2]: + for data_layout in ['NCHW']: + rand_value = ( + 0.001 + * paddle.rand( + shape=[x_shape[1]], dtype=dtype + ).numpy() + ) + with paddle.pir_utils.IrGuard(): + start_prog = paddle.static.Program() + main_prog = paddle.static.Program() + with paddle.pir.core.program_guard( + main_prog, start_prog + ): + residual = paddle.static.data( + name='residual', + shape=residual_shape, + dtype=dtype, + ) + x = paddle.static.data( + name='x', shape=x_shape, dtype=dtype + ) + w = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + b = create_parameter( + shape=[residual_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + add_out = paddle.add(x, residual) + + group_norm_out = ( + paddle.nn.functional.group_norm( + add_out, + num_groups=groups, + epsilon=epilson, + weight=w, + bias=b, + data_format=data_layout, + ) + ) + out = paddle.assign(group_norm_out) + self.pass_attr_list = [ + {'add_norm_fuse_pass': {}}, + {'transfer_layout_pass': {}}, + { + 'remove_redundant_transpose_pass': {} + }, + ] + self.feeds = { + "x": np.random.random( + x_shape + ).astype(dtype), + "residual": np.random.random( + residual_shape + ).astype(dtype), + } + self.fetch_list = [out] + self.valid_op_map = { + "pd_op.add": 0, + "pd_op.group_norm": 0, + "pd_op.add_group_norm_silu": 1, + } + yield [main_prog, start_prog], False + + def setUp(self): + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_check_output(self): + self.check_pass_correct() + + +class TestAddGroupNormPatternSilu_FP16(PassTest): + r""" + x residual + | | + add + | + group_norm + """ + + def is_program_valid(self, program=None): + return True + + def sample_program(self): + for x_shape in [[2, 6, 4, 2]]: + for residual_shape in [[1, 6, 1, 1]]: + for dtype in ['float16']: + for epilson in [1e-5]: + for groups in [2]: + for data_layout in ['NCHW']: + rand_value = ( + 0.001 + * paddle.rand( + shape=[x_shape[1]], dtype=dtype + ).numpy() + ) + with paddle.pir_utils.IrGuard(): + start_prog = paddle.static.Program() + main_prog = paddle.static.Program() + with paddle.pir.core.program_guard( + main_prog, start_prog + ): + residual = paddle.static.data( + name='residual', + shape=residual_shape, + dtype=dtype, + ) + x = paddle.static.data( + name='x', shape=x_shape, dtype=dtype + ) + w = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + b = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + add_out = paddle.add(x, residual) + group_norm_out = ( + paddle.nn.functional.group_norm( + add_out, + num_groups=groups, + epsilon=epilson, + weight=w, + bias=b, + data_format=data_layout, + ) + ) + out = paddle.nn.functional.silu( + group_norm_out + ) + out = paddle.assign(out) + self.pass_attr_list = [ + {'add_norm_fuse_pass': {}}, + {'transfer_layout_pass': {}}, + { + 'remove_redundant_transpose_pass': {} + }, + ] + self.feeds = { + "x": np.random.random( + x_shape + ).astype(dtype), + "residual": np.random.random( + residual_shape + ).astype(dtype), + } + self.fetch_list = [out] + self.valid_op_map = { + "pd_op.silu": 0,
"pd_op.add": 0, + "pd_op.group_norm": 0, + "pd_op.add_group_norm_silu": 1, + } + yield [main_prog, start_prog], False + + def setUp(self): + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_check_output(self): + self.check_pass_correct() + + +class GroupNormSiluPattern_FP16(PassTest): + r""" + group_norm + | + silu + """ + + def is_program_valid(self, program=None): + return True + + def sample_program(self): + for x_shape in [[2, 6, 4, 2]]: + for residual_shape in [[1, 6, 1, 1]]: + for dtype in ['float16']: + for epilson in [1e-5]: + for groups in [2]: + for data_layout in ['NCHW']: + rand_value = ( + 0.001 + * paddle.rand( + shape=[x_shape[1]], dtype=dtype + ).numpy() + ) + with paddle.pir_utils.IrGuard(): + start_prog = paddle.static.Program() + main_prog = paddle.static.Program() + with paddle.pir.core.program_guard( + main_prog, start_prog + ): + x = paddle.static.data( + name='x', shape=x_shape, dtype=dtype + ) + w = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + b = create_parameter( + shape=[x_shape[1]], + dtype=dtype, + initializer=paddle.nn.initializer.Assign( + rand_value + ), + ) + group_norm_out = ( + paddle.nn.functional.group_norm( + x, + num_groups=groups, + epsilon=epilson, + weight=w, + bias=b, + data_format=data_layout, + ) + ) + out = paddle.nn.functional.silu( + group_norm_out + ) + out = paddle.assign(out) + self.pass_attr_list = [ + {'add_norm_fuse_pass': {}}, + {'transfer_layout_pass': {}}, + { + 'remove_redundant_transpose_pass': {} + }, + ] + self.feeds = { + "x": np.random.random( + x_shape + ).astype(dtype), + } + self.fetch_list = [out] + self.valid_op_map = { + "pd_op.silu": 0, + "pd_op.group_norm": 0, + "pd_op.add_group_norm_silu": 1, + } + yield [main_prog, start_prog], False + + def setUp(self): + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_check_output(self): + self.check_pass_correct() + + if __name__ == "__main__": unittest.main() diff --git a/test/ir/test_ir_fusion_group_pass.py b/test/ir/test_ir_fusion_group_pass.py index 0637efb067f7e..56c723613e939 100644 --- a/test/ir/test_ir_fusion_group_pass.py +++ b/test/ir/test_ir_fusion_group_pass.py @@ -72,7 +72,7 @@ def _feed_random_data(self, feed_vars): elif var.dtype == paddle.float16: dtype = "float16" else: - raise ValueError("Unsupported dtype %s" % var.dtype) + raise ValueError(f"Unsupported dtype {var.dtype}") feeds[var.name] = np.random.random(shape).astype(dtype) return feeds diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 8c4cfe9113ab3..f84458dd494f3 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -155,6 +155,7 @@ if(WIN32) list(REMOVE_ITEM TEST_OPS test_fused_layernorm_op) list(REMOVE_ITEM TEST_OPS test_matmul_int8_op) list(REMOVE_ITEM TEST_OPS test_variable_length_memory_efficient_attention) + list(REMOVE_ITEM TEST_OPS test_ops_nms) endif() list(REMOVE_ITEM TEST_OPS test_checkpoint_saver) @@ -425,10 +426,6 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_base_cpu) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_reduce_cpu) -list(REMOVE_ITEM TEST_OPS - test_parallel_executor_seresnext_with_fuse_all_reduce_cpu) list(REMOVE_ITEM 
TEST_OPS test_async_ssa_graph_executor_mnist) list(REMOVE_ITEM TEST_OPS test_basic_gru_api) list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op) @@ -437,6 +434,8 @@ list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass) list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) list(REMOVE_ITEM TEST_OPS test_layers) +list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) +list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) @@ -485,6 +484,8 @@ endif() # Some ops need to check results when gc is enabled # Currently, only ops that register NoNeedBufferVarsInference need to do this test set(TEST_OPS_WITH_GC + test_affine_channel_op + test_scatter_op test_concat_op test_elementwise_add_op test_lookup_table_op @@ -571,6 +572,11 @@ if((WITH_GPU) AND (WITH_CUDNN_FRONTEND)) test_fused_dot_product_attention_op) endif() +py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS + ${GC_ENVS}) +py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS + ${GC_ENVS}) + set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") @@ -753,6 +759,7 @@ if(WITH_DISTRIBUTE) endif() # setting timeout value as 15S +set_tests_properties(test_isin PROPERTIES TIMEOUT 30) set_tests_properties(test_binomial_op PROPERTIES TIMEOUT 30) set_tests_properties(test_run PROPERTIES TIMEOUT 120) set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 180) @@ -788,12 +795,14 @@ if(WITH_NV_JETSON) set_tests_properties(test_norm_op PROPERTIES TIMEOUT 1200) set_tests_properties(test_batch_norm_op_prim_nchw PROPERTIES TIMEOUT 1500) set_tests_properties(test_batch_norm_op_prim_nhwc PROPERTIES TIMEOUT 1500) + set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 1500) else() set_tests_properties(test_concat_op PROPERTIES TIMEOUT 400) set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_norm_op PROPERTIES TIMEOUT 150) set_tests_properties(test_batch_norm_op_prim_nchw PROPERTIES TIMEOUT 250) set_tests_properties(test_batch_norm_op_prim_nhwc PROPERTIES TIMEOUT 250) + set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules(test_conv3d_transpose_op MODULES test_conv3d_transpose_op @@ -946,6 +955,9 @@ if(WITH_CUDNN_FRONTEND) endif() set(TEST_CINN_OPS + test_assign_op + test_atan2_op + test_cast_op test_stack_op test_activation_op test_fill_any_like_op @@ -954,6 +966,22 @@ set(TEST_CINN_OPS test_elementwise_sub_op test_elementwise_div_op test_elementwise_max_op + test_elementwise_mul_op + test_elementwise_pow_op + test_expand_v2_op + test_flatten_contiguous_range_op + test_flip + test_full_like_op + test_top_k_op + test_top_k_v2_op + test_reshape_op + test_triangular_solve_op + test_split_op + test_scatter_op + test_reverse_op + test_roll_op + test_meshgrid_op + test_index_select_op test_mean_op test_clip_op test_gather_op @@ -997,6 +1025,10 @@ set_tests_properties( # These UTs are to temporarily test static build for standalone_executor, will be removed after static build is enabled by default. 
set(STATIC_BUILD_TESTS test_adagrad_op + test_eigh_op + test_matmul_op + test_matmul_v2_op + test_paddle_save_load_binary test_assign_pos_op test_bucketize_api test_c_embedding_op @@ -1099,3 +1131,45 @@ set_pir_tests_properties() set_tests_properties(test_nadam_op PROPERTIES TIMEOUT 100) set_tests_properties(test_radam_op PROPERTIES TIMEOUT 100) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) +set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv2d_op_depthwise_conv + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120) +set_tests_properties(test_crop_tensor_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 300) +set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) +set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_fractional_max_pool2d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) +set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_star_gan_with_gradient_penalty + PROPERTIES TIMEOUT 120) +set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120) +set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) +if(WIN32) + set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900) +else() + set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600) +endif() +set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80) +set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120) +set_tests_properties(test_paddle_save_load_binary_static_build + PROPERTIES TIMEOUT 120) diff --git a/test/legacy_test/dist_ctr_reader.py b/test/legacy_test/dist_ctr_reader.py index 23f4daf2a5d8f..039d2c8aaf178 100644 --- a/test/legacy_test/dist_ctr_reader.py +++ b/test/legacy_test/dist_ctr_reader.py @@ -114,7 +114,7 @@ def train(self): Load trainset. ''' file_name = "train.txt" - logger.info("load trainset from %s" % file_name) + logger.info(f"load trainset from {file_name}") mode = TaskMode.create_train() return self._parse_creator(file_name, mode) @@ -123,7 +123,7 @@ def test(self): Load testset. 
''' file_name = "test.txt" - logger.info("load testset from %s" % file_name) + logger.info(f"load testset from {file_name}") mode = TaskMode.create_test() return self._parse_creator(file_name, mode) @@ -132,7 +132,7 @@ def infer(self): Load infer set. ''' file_name = "infer.txt" - logger.info("load inferset from %s" % file_name) + logger.info(f"load inferset from {file_name}") mode = TaskMode.create_infer() return self._parse_creator(file_name, mode) diff --git a/test/legacy_test/gradient_checker.py b/test/legacy_test/gradient_checker.py index 210db283b979a..41c668043e3f8 100644 --- a/test/legacy_test/gradient_checker.py +++ b/test/legacy_test/gradient_checker.py @@ -324,7 +324,7 @@ def _compute_analytical_jacobian_pir( filted_idx, filted_dx = zip(*filted) # get the name in feeds of dyi - name = 'dys_%s' % i + name = f'dys_{i}' np_t = np.array(feeds[name]).astype(np_type) shape = np_t.shape np_t = np_t.flatten() @@ -392,7 +392,7 @@ def fail_test(msg): if in_pir_mode(): analytical = [] for i in range(len(y)): - name = 'dys_%s' % i + name = f'dys_{i}' feeds.update( { name: np.zeros( @@ -780,7 +780,7 @@ def get_pir_static_double_grad( yi.persistable = True np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='Dgrad_%s' % i, + name=f'Dgrad_{i}', shape=yi.shape, dtype=np_type, ) @@ -797,7 +797,7 @@ def get_pir_static_double_grad( yi.persistable = True np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='Dgrad_%s' % i, + name=f'Dgrad_{i}', shape=yi.shape, dtype=np_type, ) @@ -851,12 +851,12 @@ def get_pir_static_double_grad( yi = y[i] np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='dys_%s' % i, + name=f'dys_{i}', shape=yi.shape, dtype=np_type, ) value = np.ones(yi.shape, dtype=np_type) - feeds.update({'dys_%s' % i: value}) + feeds.update({f'dys_{i}': value}) dys.append(dy) # append second order backward @@ -1130,7 +1130,7 @@ def get_pir_static_triple_grad( yi.persistable = True np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='Tgrad_%s' % i, + name=f'Tgrad_{i}', shape=yi.shape, dtype=np_type, ) @@ -1147,7 +1147,7 @@ def get_pir_static_triple_grad( yi.persistable = True np_type = dtype_to_np_dtype(yi.dtype) dy = paddle.static.data( - name='Tgrad_%s' % i, + name=f'Tgrad_{i}', shape=yi.shape, dtype=np_type, ) diff --git a/test/legacy_test/op.py b/test/legacy_test/op.py index 0dec2f001188e..e60a0e63ae8dd 100644 --- a/test/legacy_test/op.py +++ b/test/legacy_test/op.py @@ -163,7 +163,7 @@ def __call__(self, *args, **kwargs): new_attr.scalars.MergeFrom(item) else: raise NotImplementedError( - "A not supported attribute type: %s." % (str(attr.type)) + f"A not supported attribute type: {str(attr.type)}." ) for attr_name, defalut_val in self.__extra_attrs__.items(): user_defined_attr = kwargs.get(attr_name, None) @@ -212,7 +212,7 @@ def __call__(self, *args, **kwargs): new_attr.scalars.MergeFrom(item) else: raise NotImplementedError( - "A not supported attribute type: %s." % (str(attr_type)) + f"A not supported attribute type: {str(attr_type)}." ) return op_desc @@ -292,7 +292,7 @@ def types(self): def get_op_info(self, t): if t not in self.op_methods: - raise ValueError("The operator: %s is not registered." 
% t) + raise ValueError(f"The operator: {t} is not registered.") return self.op_methods.get(t) def get_op_input_names(self, type): diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index ed4e0f478ed38..eec710f01cf8e 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -114,7 +114,7 @@ def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs): ) input_t.append( paddle.static.data( - name='data_%s' % index, shape=shape, dtype=dtype + name=f'data_{index}', shape=shape, dtype=dtype ) ) @@ -223,7 +223,7 @@ def __get_elem__(tensor, i): return tensor._get_complex128_element(i) else: raise TypeError( - "Unsupported test data type %s." % tensor_to_check_dtype + f"Unsupported test data type {tensor_to_check_dtype}." ) def __set_elem__(tensor, i, e): @@ -251,7 +251,7 @@ def __set_elem__(tensor, i, e): return tensor._set_complex128_element(i, e) else: raise TypeError( - "Unsupported test data type %s." % tensor_to_check_dtype + f"Unsupported test data type {tensor_to_check_dtype}." ) # we only compute gradient of one element each time. @@ -501,7 +501,7 @@ def is_complex_test(): and not hasattr(cls, "exist_check_grad") ): raise AssertionError( - "This test of %s op needs check_grad." % cls.op_type + f"This test of {cls.op_type} op needs check_grad." ) # check for op test with fp64 precision, but not check onednn op test for now @@ -518,8 +518,7 @@ def is_complex_test(): and not cls.check_prim_pir ): raise AssertionError( - "This test of %s op needs check_grad with fp64 precision." - % cls.op_type + f"This test of {cls.op_type} op needs check_grad with fp64 precision." ) if ( @@ -1061,7 +1060,7 @@ def create_var( name_temp = name else: nplist_value_temp = np_list[name] - name_temp = unique_name.generate("%s_out" % (name)) + name_temp = unique_name.generate(f"{name}_out") v = create_var( nplist_value_temp, name_temp, @@ -1184,10 +1183,9 @@ def cal_python_api(python_api, args, kernel_sig): return None if not hasattr(self, "python_api"): print(kernel_sig) - assert hasattr(self, "python_api"), ( - "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_dygraph = True" - % self.op_type - ) + assert hasattr( + self, "python_api" + ), f"Detect there is KernelSignature for `{self.op_type}` op, please set the `self.python_api` if you set check_dygraph = True" args = OpTestUtils.prepare_python_api_arguments( self.python_api, dygraph_tensor_inputs, @@ -1288,10 +1286,9 @@ def get_kernel_signature(self, place, egr_inps=None, egr_oups=None): return None if not hasattr(self, "python_api"): print(kernel_sig) - assert hasattr(self, "python_api"), ( - "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_dygraph = True" - % self.op_type - ) + assert hasattr( + self, "python_api" + ), f"Detect there is KernelSignature for `{self.op_type}` op, please set the `self.python_api` if you set check_dygraph = True" return kernel_sig def get_ir_input_attr_dict_and_feed(self, stop_gradient): @@ -2573,7 +2570,7 @@ def _is_skip_name(self, name): not in no_check_set_white_list.no_check_set_white_list ): raise AssertionError( - "no_check_set of op %s must be set to None." % self.op_type + f"no_check_set of op {self.op_type} must be set to None." 
) if check_prim: @@ -3091,7 +3088,7 @@ def check_grad_with_place_for_static( analytic_grads, inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(place), + f"Gradient Check On {str(place)}", atol=atol, ) @@ -3366,7 +3363,7 @@ def check_grad_with_place( dygraph_dygraph_grad, inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(place), + f"Gradient Check On {str(place)}", atol=atol, ) @@ -3406,7 +3403,7 @@ def check_grad_with_place( pir_grad, inputs_to_check, max_relative_error, - "Gradient Check On %s" % str(place), + f"Gradient Check On {str(place)}", atol=atol, ) @@ -3484,7 +3481,7 @@ def _get_dygraph_grad( ) else: raise TypeError( - "Unsupported test data type %s." % type(cast_input) + f"Unsupported test data type {type(cast_input)}." ) outputs = {} @@ -3850,12 +3847,12 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): range(len(user_defined_grad_outputs)), ): grad_val = paddle.static.data( - name='val_grad_%s' % idx, + name=f'val_grad_{idx}', shape=grad_out_value.shape, dtype=grad_out_value.dtype, ) grad_outputs.append(grad_val) - feed.update({'val_grad_%s' % idx: grad_out_value}) + feed.update({f'val_grad_{idx}': grad_out_value}) # delete the inputs which no need to calculate grad for no_grad_val in no_grad_set: del static_inputs[no_grad_val] @@ -3894,8 +3891,7 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): ) else: raise TypeError( - "Unsupported test data type %s." - % type(cast_input) + f"Unsupported test data type {type(cast_input)}." ) outputs = {} diff --git a/test/legacy_test/prim_op_test.py b/test/legacy_test/prim_op_test.py index 6894d37a2839a..c059499f43e16 100644 --- a/test/legacy_test/prim_op_test.py +++ b/test/legacy_test/prim_op_test.py @@ -100,8 +100,7 @@ def _get_kernel_signature( """we think the kernel_sig is missing.""" kernel_sig = None print( - "[Warning: op_test.py] Kernel Signature is not found for %s, fall back to intermediate state." - % op_type + f"[Warning: op_test.py] Kernel Signature is not found for {op_type}, fall back to intermediate state." 
) return kernel_sig @@ -677,9 +676,9 @@ def check_static_comp(self): # ensure the operator not in program if check_prim is True if not in_pir_mode(): forward_ops = [op.type for op in main_program.blocks[0].ops] - assert self.op_type not in forward_ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (self.op_type) + assert ( + self.op_type not in forward_ops + ), f"{self.op_type} shouldn't appear in program when check_prim is True" exe = paddle.static.Executor(self.place) exe.run(startup_program) ret = exe.run(main_program, feed=feed, fetch_list=ret) @@ -761,9 +760,9 @@ def check_jit_comp(self): .forward_program.block(0) .ops ] - assert self.op_type not in forward_ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (self.op_type) + assert ( + self.op_type not in forward_ops + ), f"{self.op_type} shouldn't appear in program when check_prim is True" ret = flatten(_as_list(net(args))) ret = paddle.utils.map_structure(lambda x: x.numpy(), ret) if OpTestUtils.is_bfloat16_type(self.dtype): @@ -854,9 +853,9 @@ def check_jit_comp_with_cinn(self): .forward_program.block(0) .ops ] - assert self.op_type not in forward_ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (self.op_type) + assert ( + self.op_type not in forward_ops + ), f"{self.op_type} shouldn't appear in program when check_prim is True" ret = flatten(_as_list(net(args))) ret = paddle.utils.map_structure(lambda x: x.numpy(), ret) if OpTestUtils.is_bfloat16_type(self.dtype): @@ -1160,9 +1159,9 @@ def check_static_comp(self): if not in_pir_mode(): ops = [op.type for op in main_program.blocks[0].ops] backward_op_type = self.op_type + "_grad" - assert backward_op_type not in ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (backward_op_type) + assert ( + backward_op_type not in ops + ), f"{backward_op_type} shouldn't appear in program when check_prim is True" elif self.prim_op_type == "prim": grad_ops = [] for op in main_program.global_block().ops: @@ -1261,9 +1260,9 @@ def check_jit_comp(self): .ops ] backward_op_type = self.op_type + "_grad" - assert backward_op_type not in ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (backward_op_type) + assert ( + backward_op_type not in ops + ), f"{backward_op_type} shouldn't appear in program when check_prim is True" out = _as_list(net(args)) if hasattr(self.op_test, "python_out_sig"): outputs_sig = self.op_test.python_out_sig @@ -1387,9 +1386,9 @@ def check_jit_comp_with_cinn(self): .ops ] backward_op_type = self.op_type + "_grad" - assert backward_op_type not in ops, ( - "%s shouldn't appear in program when check_prim is True" - ) % (backward_op_type) + assert ( + backward_op_type not in ops + ), f"{backward_op_type} shouldn't appear in program when check_prim is True" out = _as_list(net(args)) if hasattr(self.op_test, "python_out_sig"): diff --git a/test/legacy_test/test_ZeroPad1d.py b/test/legacy_test/test_ZeroPad1d.py new file mode 100644 index 0000000000000..31baf6a7cf246 --- /dev/null +++ b/test/legacy_test/test_ZeroPad1d.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle import to_tensor +from paddle.nn import ZeroPad1D + + +class TestZeroPad1dAPI(unittest.TestCase): + def setUp(self): + if paddle.is_compiled_with_cuda(): + paddle.device.set_device('gpu:0') + else: + paddle.device.set_device('cpu') + self.shape = [4, 6, 6] + self.support_dtypes = ['float32', 'float64', 'int32', 'int64'] + + def test_support_dtypes(self): + for dtype in self.support_dtypes: + pad = 2 + x = np.random.randint(-255, 255, size=self.shape).astype(dtype) + expect_res = np.pad( + x, + [[0, 0], [0, 0], [pad, pad]], + mode='constant', + constant_values=0, + ) + + x_tensor = to_tensor(x).astype(dtype) + zeropad1d = ZeroPad1D(padding=pad) + ret_res = zeropad1d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad2(self): + pad = [1, 2] + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad( + x, [[0, 0], [0, 0], pad], mode='constant', constant_values=0 + ) + + x_tensor = to_tensor(x) + zeropad1d = ZeroPad1D(padding=pad) + ret_res = zeropad1d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad3(self): + pad = (1, 2) + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad(x, [[0, 0], [0, 0], [pad[0], pad[1]]]) + + x_tensor = to_tensor(x) + zeropad1d = ZeroPad1D(padding=pad) + ret_res = zeropad1d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad4(self): + pad = [1, 2] + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad(x, [[0, 0], [0, 0], [pad[0], pad[1]]]) + + x_tensor = to_tensor(x) + pad_tensor = to_tensor(pad, dtype='int32') + zeropad1d = ZeroPad1D(padding=pad_tensor) + ret_res = zeropad1d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_repr(self): + pad = [1, 2] + zeropad1d = ZeroPad1D(padding=pad) + name_str = zeropad1d.extra_repr() + assert name_str == 'padding=[1, 2], data_format=NCL' + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_ZeroPad3d.py b/test/legacy_test/test_ZeroPad3d.py new file mode 100644 index 0000000000000..8cc7a45c959df --- /dev/null +++ b/test/legacy_test/test_ZeroPad3d.py @@ -0,0 +1,117 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
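+
+# A standalone sketch of the contract the cases in this file assert,
+# assuming the public paddle.nn.ZeroPad3D API exercised below; the
+# padding order (left, right, top, bottom, front, back) is inferred
+# from the numpy reference padding used in the tests, so treat it as
+# illustrative rather than authoritative.
+def _zeropad3d_sketch():
+    import numpy as np
+    import paddle
+    from paddle.nn import ZeroPad3D
+
+    x = paddle.rand([1, 2, 3, 3, 3])  # NCDHW input
+    pad = (1, 2, 3, 4, 5, 6)  # W gets (1, 2), H gets (3, 4), D gets (5, 6)
+    ref = np.pad(
+        x.numpy(),
+        [[0, 0], [0, 0], [pad[4], pad[5]], [pad[2], pad[3]], [pad[0], pad[1]]],
+        mode='constant',
+        constant_values=0,
+    )
+    np.testing.assert_allclose(ZeroPad3D(padding=pad)(x).numpy(), ref)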
+ +import unittest + +import numpy as np + +import paddle +from paddle import to_tensor +from paddle.nn import ZeroPad3D + + +class TestZeroPad3DAPI(unittest.TestCase): + def setUp(self): + if paddle.is_compiled_with_cuda(): + paddle.device.set_device('gpu:0') + else: + paddle.device.set_device('cpu') + self.shape = [4, 3, 6, 6, 6] + self.support_dtypes = ['float32', 'float64', 'int32', 'int64'] + + def test_support_dtypes(self): + for dtype in self.support_dtypes: + pad = 2 + x = np.random.randint(-255, 255, size=self.shape).astype(dtype) + expect_res = np.pad( + x, + [[0, 0], [0, 0], [pad, pad], [pad, pad], [pad, pad]], + mode='constant', + constant_values=0, + ) + + x_tensor = to_tensor(x).astype(dtype) + zeropad3d = ZeroPad3D(padding=pad) + ret_res = zeropad3d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad2(self): + pad = [1, 2, 3, 4, 5, 6] + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad( + x, + [ + [0, 0], + [0, 0], + [pad[4], pad[5]], + [pad[2], pad[3]], + [pad[0], pad[1]], + ], + mode='constant', + constant_values=0, + ) + + x_tensor = to_tensor(x) + zeropad3d = ZeroPad3D(padding=pad) + ret_res = zeropad3d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad3(self): + pad = (1, 2, 3, 4, 5, 6) + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad( + x, + [ + [0, 0], + [0, 0], + [pad[4], pad[5]], + [pad[2], pad[3]], + [pad[0], pad[1]], + ], + ) + + x_tensor = to_tensor(x) + zeropad3d = ZeroPad3D(padding=pad) + ret_res = zeropad3d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_support_pad4(self): + pad = [1, 2, 3, 4, 5, 6] + x = np.random.randint(-255, 255, size=self.shape) + expect_res = np.pad( + x, + [ + [0, 0], + [0, 0], + [pad[4], pad[5]], + [pad[2], pad[3]], + [pad[0], pad[1]], + ], + ) + + x_tensor = to_tensor(x) + pad_tensor = to_tensor(pad, dtype='int32') + zeropad3d = ZeroPad3D(padding=pad_tensor) + ret_res = zeropad3d(x_tensor).numpy() + np.testing.assert_allclose(expect_res, ret_res, rtol=1e-05) + + def test_repr(self): + pad = pad = [1, 2, 3, 4, 5, 6] + zeropad3d = ZeroPad3D(padding=pad) + name_str = zeropad3d.extra_repr() + assert name_str == 'padding=[1, 2, 3, 4, 5, 6], data_format=NCDHW' + + +if __name__ == '__main__': + unittest.main() diff --git a/test/deprecated/legacy_test/test_accuracy_op.py b/test/legacy_test/test_accuracy_op.py similarity index 99% rename from test/deprecated/legacy_test/test_accuracy_op.py rename to test/legacy_test/test_accuracy_op.py index 44c4cfa7c49ac..bf6d86d10da9e 100755 --- a/test/deprecated/legacy_test/test_accuracy_op.py +++ b/test/legacy_test/test_accuracy_op.py @@ -126,7 +126,7 @@ def test_type_errors(self): self.assertRaises(TypeError, paddle.metric.accuracy, x2, label) x3 = paddle.static.data( - name='input', shape=[-1, 2], dtype="float16" + name='input', shape=[-1, 2], dtype="float32" ) paddle.static.accuracy(input=x3, label=label) paddle.metric.accuracy(input=x3, label=label) diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 7806017bbfeed..5e727c7580580 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -365,19 +365,19 @@ def test_out_name(self): data = paddle.static.data( name="X", shape=[-1, 1], dtype="float32" ) - out = eval("paddle.%s(data, name='Y')" % self.op_type) + out = eval(f"paddle.{self.op_type}(data, name='Y')") place = 
base.CPUPlace() exe = base.Executor(place) (result,) = exe.run(feed={"X": np_x}, fetch_list=[out]) - expected = eval("np.%s(np_x)" % self.op_type) + expected = eval(f"np.{self.op_type}(np_x)") np.testing.assert_allclose(result, expected, rtol=1e-05) def test_dygraph(self): with base.dygraph.guard(): np_x = np.array([0.1]) x = paddle.to_tensor(np_x) - z = eval("paddle.%s(x).numpy()" % self.op_type) - z_expected = eval("np.%s(np_x)" % self.op_type) + z = eval(f"paddle.{self.op_type}(x).numpy()") + z_expected = eval(f"np.{self.op_type}(np_x)") np.testing.assert_allclose(z, z_expected, rtol=1e-05) @@ -5359,7 +5359,7 @@ def create_test_act_fp16_class( enable_cinn=False, check_pir=False, grad_atol=1e-2, - **kwargs + **kwargs, ): @unittest.skipIf( not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" @@ -5556,7 +5556,7 @@ def create_test_act_bf16_class( check_pir=False, check_prim_pir=False, grad_atol=1e-2, - **kwargs + **kwargs, ): @unittest.skipIf( not core.is_compiled_with_cuda() diff --git a/test/deprecated/legacy_test/test_add_position_encoding_op.py b/test/legacy_test/test_add_position_encoding_op.py similarity index 100% rename from test/deprecated/legacy_test/test_add_position_encoding_op.py rename to test/legacy_test/test_add_position_encoding_op.py diff --git a/test/deprecated/legacy_test/test_addmm_op.py b/test/legacy_test/test_addmm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_addmm_op.py rename to test/legacy_test/test_addmm_op.py diff --git a/test/deprecated/legacy_test/test_affine_channel_op.py b/test/legacy_test/test_affine_channel_op.py similarity index 100% rename from test/deprecated/legacy_test/test_affine_channel_op.py rename to test/legacy_test/test_affine_channel_op.py diff --git a/test/deprecated/legacy_test/test_affine_grid_op.py b/test/legacy_test/test_affine_grid_op.py similarity index 100% rename from test/deprecated/legacy_test/test_affine_grid_op.py rename to test/legacy_test/test_affine_grid_op.py diff --git a/test/deprecated/legacy_test/test_assign_op.py b/test/legacy_test/test_assign_op.py similarity index 100% rename from test/deprecated/legacy_test/test_assign_op.py rename to test/legacy_test/test_assign_op.py diff --git a/test/deprecated/legacy_test/test_atan2_op.py b/test/legacy_test/test_atan2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_atan2_op.py rename to test/legacy_test/test_atan2_op.py diff --git a/test/deprecated/legacy_test/test_attribute_var.py b/test/legacy_test/test_attribute_var.py similarity index 82% rename from test/deprecated/legacy_test/test_attribute_var.py rename to test/legacy_test/test_attribute_var.py index e06e8a3d80d50..cdae49ba0741a 100644 --- a/test/deprecated/legacy_test/test_attribute_var.py +++ b/test/legacy_test/test_attribute_var.py @@ -66,43 +66,6 @@ def infer_prog(self): return res -class TestDropout(UnittestBase): - def init_info(self): - self.shapes = [[10, 10]] - self.save_path = os.path.join(self.temp_dir.name, 'dropout') - - def test_static(self): - main_prog = Program() - startup_prog = Program() - with program_guard(main_prog, startup_prog): - fc = paddle.nn.Linear(10, 10) - x = paddle.randn(self.shapes[0]) - x.stop_gradient = False - feat = fc(x) - # p is a Variable - p = paddle.randn([1]) - out = paddle.nn.functional.dropout(feat, p=p) - sgd = paddle.optimizer.SGD() - sgd.minimize(paddle.mean(out)) - # test _to_string - self.assertTrue("Var[" in str(main_prog)) - - exe = paddle.static.Executor() - exe.run(startup_prog) - res = 
exe.run(fetch_list=[x, out]) - # export model - paddle.static.save_inference_model(self.save_path, [x], [out], exe) - - # Test for Inference Predictor - infer_out = self.infer_prog() - self.assertEqual(infer_out.shape, (10, 10)) - - self.assertEqual( - main_prog.block(0).ops[4].all_attrs()['dropout_prob'].name, - p.name, - ) - - class TestTileTensorList(UnittestBase): def init_info(self): self.shapes = [[2, 3, 4]] diff --git a/test/deprecated/legacy_test/test_bce_loss.py b/test/legacy_test/test_bce_loss.py similarity index 100% rename from test/deprecated/legacy_test/test_bce_loss.py rename to test/legacy_test/test_bce_loss.py diff --git a/test/deprecated/legacy_test/test_bicubic_interp_op.py b/test/legacy_test/test_bicubic_interp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_bicubic_interp_op.py rename to test/legacy_test/test_bicubic_interp_op.py diff --git a/test/deprecated/legacy_test/test_bilinear_interp_op.py b/test/legacy_test/test_bilinear_interp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_bilinear_interp_op.py rename to test/legacy_test/test_bilinear_interp_op.py diff --git a/test/legacy_test/test_block_diag.py b/test/legacy_test/test_block_diag.py new file mode 100644 index 0000000000000..842f360f33c4b --- /dev/null +++ b/test/legacy_test/test_block_diag.py @@ -0,0 +1,95 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
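+
+# A pure-NumPy sketch of the block_diag contract asserted below against
+# scipy.linalg.block_diag (a hedged reference, not the actual kernel):
+# 1-D inputs are promoted to 1 x n row blocks, the blocks are laid out
+# along the main diagonal, and every off-block entry is zero.
+def _block_diag_reference(mats):
+    import numpy as np
+
+    mats = [np.atleast_2d(np.asarray(m)) for m in mats]
+    rows = sum(m.shape[0] for m in mats)
+    cols = sum(m.shape[1] for m in mats)
+    out = np.zeros((rows, cols), dtype=np.result_type(*mats))
+    r = c = 0
+    for m in mats:
+        h, w = m.shape
+        out[r:r + h, c:c + w] = m  # place the block, advance the corner
+        r += h
+        c += w
+    return out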
+ +import unittest + +import numpy as np +import scipy + +import paddle +from paddle import base + + +class TestBlockDiagError(unittest.TestCase): + def test_errors(self): + def test_type_error(): + A = np.array([[1, 2], [3, 4]]) + B = np.array([[5, 6], [7, 8]]) + C = np.array([[9, 10], [11, 12]]) + with paddle.static.program_guard(base.Program()): + out = paddle.block_diag([A, B, C]) + + self.assertRaises(TypeError, test_type_error) + + def test_dime_error(): + A = paddle.to_tensor([[[1, 2], [3, 4]]]) + B = paddle.to_tensor([[[5, 6], [7, 8]]]) + C = paddle.to_tensor([[[9, 10], [11, 12]]]) + with paddle.static.program_guard(base.Program()): + out = paddle.block_diag([A, B, C]) + + self.assertRaises(ValueError, test_dime_error) + + +class TestBlockDiag(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.type_list = ['int32', 'int64', 'float32', 'float64'] + self.place = [('cpu', paddle.CPUPlace())] + ( + [('gpu', paddle.CUDAPlace(0))] + if paddle.is_compiled_with_cuda() + else [] + ) + + def test_dygraph(self): + paddle.disable_static() + for device, place in self.place: + paddle.set_device(device) + for i in self.type_list: + A = np.random.randn(2, 3).astype(i) + B = np.random.randn(2).astype(i) + C = np.random.randn(4, 1).astype(i) + s_out = scipy.linalg.block_diag(A, B, C) + + A_tensor = paddle.to_tensor(A) + B_tensor = paddle.to_tensor(B) + C_tensor = paddle.to_tensor(C) + out = paddle.block_diag([A_tensor, B_tensor, C_tensor]) + np.testing.assert_allclose(out.numpy(), s_out) + + def test_static(self): + paddle.enable_static() + for device, place in self.place: + paddle.set_device(device) + for i in self.type_list: + A = np.random.randn(2, 3).astype(i) + B = np.random.randn(2).astype(i) + C = np.random.randn(4, 1).astype(i) + s_out = scipy.linalg.block_diag(A, B, C) + + with paddle.static.program_guard(paddle.static.Program()): + A_tensor = paddle.static.data('A', [2, 3], i) + B_tensor = paddle.static.data('B', [2], i) + C_tensor = paddle.static.data('C', [4, 1], i) + out = paddle.block_diag([A_tensor, B_tensor, C_tensor]) + exe = paddle.static.Executor(place) + res = exe.run( + feed={'A': A, 'B': B, 'C': C}, + fetch_list=[out], + ) + np.testing.assert_allclose(res[0], s_out) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/deprecated/legacy_test/test_bmm_op.py b/test/legacy_test/test_bmm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_bmm_op.py rename to test/legacy_test/test_bmm_op.py diff --git a/test/deprecated/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cast_op.py rename to test/legacy_test/test_cast_op.py diff --git a/test/deprecated/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py similarity index 100% rename from test/deprecated/legacy_test/test_channel_shuffle.py rename to test/legacy_test/test_channel_shuffle.py diff --git a/test/legacy_test/test_cholesky_op.py b/test/legacy_test/test_cholesky_op.py index d98596fc29c89..25fc0f9365299 100644 --- a/test/legacy_test/test_cholesky_op.py +++ b/test/legacy_test/test_cholesky_op.py @@ -121,14 +121,14 @@ def func(self, place): for i in range(len(out)): yi = out[i] dy = paddle.static.data( - name='dys_%s' % i, + name=f'dys_{i}', shape=yi.shape, dtype=root_data.dtype, ) dy.stop_gradient = False dy.persistable = True value = np.zeros(yi.shape, dtype=root_data.dtype) - feeds.update({'dys_%s' % i: value}) + feeds.update({f'dys_{i}': value}) dys.append(dy) fetch_list = 
base.gradients(out, root, dys) grad_check( diff --git a/test/legacy_test/test_collective_api_base.py b/test/legacy_test/test_collective_api_base.py index fa31fe1e16b54..dfc5c36a7eb5a 100644 --- a/test/legacy_test/test_collective_api_base.py +++ b/test/legacy_test/test_collective_api_base.py @@ -201,7 +201,7 @@ def setUp(self): self._trainers = 2 self._ps_endpoints = f"127.0.0.1:{self._find_free_port()},127.0.0.1:{self._find_free_port()}" self._python_interp = sys.executable - self._master_endpoints = "127.0.0.1:%s" % (self._find_free_port()) + self._master_endpoints = f"127.0.0.1:{self._find_free_port()}" self.temp_dir = tempfile.TemporaryDirectory() @@ -305,15 +305,15 @@ def _run_cluster(self, model_file, envs): tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n') + sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n') # close trainer file tr0_pipe.close() tr1_pipe.close() with open(path0, "r") as f: - sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) + sys.stderr.write(f'trainer 0 stderr file: {f.read()}\n') with open(path1, "r") as f: - sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) + sys.stderr.write(f'trainer 1 stderr file: {f.read()}\n') def load_and_remove(path): with open(path, 'rb') as f: diff --git a/test/legacy_test/test_collective_base.py b/test/legacy_test/test_collective_base.py index b11b992bcd5f8..07573f6ce7e00 100644 --- a/test/legacy_test/test_collective_base.py +++ b/test/legacy_test/test_collective_base.py @@ -232,8 +232,8 @@ def _run_cluster(self, model_file, envs): tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n') + sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n') # close trainer file tr0_pipe.close() tr1_pipe.close() diff --git a/test/deprecated/legacy_test/test_complex_abs.py b/test/legacy_test/test_complex_abs.py similarity index 100% rename from test/deprecated/legacy_test/test_complex_abs.py rename to test/legacy_test/test_complex_abs.py diff --git a/test/deprecated/legacy_test/test_complex_op.py b/test/legacy_test/test_complex_op.py similarity index 100% rename from test/deprecated/legacy_test/test_complex_op.py rename to test/legacy_test/test_complex_op.py diff --git a/test/deprecated/legacy_test/test_complex_variable.py b/test/legacy_test/test_complex_variable.py similarity index 100% rename from test/deprecated/legacy_test/test_complex_variable.py rename to test/legacy_test/test_complex_variable.py diff --git a/test/deprecated/legacy_test/test_complex_view_op.py b/test/legacy_test/test_complex_view_op.py similarity index 100% rename from test/deprecated/legacy_test/test_complex_view_op.py rename to test/legacy_test/test_complex_view_op.py diff --git a/test/deprecated/legacy_test/test_conj_op.py b/test/legacy_test/test_conj_op.py similarity index 100% rename from test/deprecated/legacy_test/test_conj_op.py rename to test/legacy_test/test_conj_op.py diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py index a3bfa75d1225f..b0b0d0abe2d96 100644 --- a/test/legacy_test/test_conv2d_op.py +++ b/test/legacy_test/test_conv2d_op.py @@ -34,14 +34,14 @@ def conv2d_forward_naive( ): if padding_algorithm not in ["SAME", "VALID", 
"EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( - "Unknown Attr(data_format): '%s' ." - "It can only be 'NCHW' or 'NHWC'." % str(data_format) + f"Unknown Attr(data_format): '{str(data_format)}' ." + "It can only be 'NCHW' or 'NHWC'." ) channel_last = data_format == "NHWC" diff --git a/test/deprecated/legacy_test/test_conv2d_op_depthwise_conv.py b/test/legacy_test/test_conv2d_op_depthwise_conv.py similarity index 100% rename from test/deprecated/legacy_test/test_conv2d_op_depthwise_conv.py rename to test/legacy_test/test_conv2d_op_depthwise_conv.py diff --git a/test/legacy_test/test_conv2d_transpose_op.py b/test/legacy_test/test_conv2d_transpose_op.py index 36796adfdaec2..dd14afecf09ec 100644 --- a/test/legacy_test/test_conv2d_transpose_op.py +++ b/test/legacy_test/test_conv2d_transpose_op.py @@ -37,8 +37,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if attrs['data_format'] == 'NHWC': diff --git a/test/deprecated/legacy_test/test_conv2d_transpose_op_depthwise_conv.py b/test/legacy_test/test_conv2d_transpose_op_depthwise_conv.py similarity index 100% rename from test/deprecated/legacy_test/test_conv2d_transpose_op_depthwise_conv.py rename to test/legacy_test/test_conv2d_transpose_op_depthwise_conv.py diff --git a/test/legacy_test/test_conv3d_op.py b/test/legacy_test/test_conv3d_op.py index 143deb493c756..a41580c7b0445 100644 --- a/test/legacy_test/test_conv3d_op.py +++ b/test/legacy_test/test_conv3d_op.py @@ -37,14 +37,14 @@ def conv3d_forward_naive( ): if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( - "Unknown Attr(data_format): '%s' ." - "It can only be 'NCDHW' or 'NDHWC'." % str(data_format) + f"Unknown Attr(data_format): '{str(data_format)}' ." + "It can only be 'NCDHW' or 'NDHWC'." ) channel_last = data_format == "NDHWC" diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py index 78d88d53ff500..9e6f3445eaf99 100644 --- a/test/legacy_test/test_conv3d_transpose_op.py +++ b/test/legacy_test/test_conv3d_transpose_op.py @@ -42,8 +42,8 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." 
) if attrs['data_format'] == 'NHWC': diff --git a/test/deprecated/legacy_test/test_copysign_op.py b/test/legacy_test/test_copysign_op.py similarity index 100% rename from test/deprecated/legacy_test/test_copysign_op.py rename to test/legacy_test/test_copysign_op.py diff --git a/test/deprecated/legacy_test/test_crop_tensor_op.py b/test/legacy_test/test_crop_tensor_op.py similarity index 100% rename from test/deprecated/legacy_test/test_crop_tensor_op.py rename to test/legacy_test/test_crop_tensor_op.py diff --git a/test/deprecated/legacy_test/test_cross_entropy2_op.py b/test/legacy_test/test_cross_entropy2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cross_entropy2_op.py rename to test/legacy_test/test_cross_entropy2_op.py diff --git a/test/deprecated/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cross_entropy_op.py rename to test/legacy_test/test_cross_entropy_op.py diff --git a/test/deprecated/legacy_test/test_cummax_op.py b/test/legacy_test/test_cummax_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cummax_op.py rename to test/legacy_test/test_cummax_op.py diff --git a/test/deprecated/legacy_test/test_cumprod_op.py b/test/legacy_test/test_cumprod_op.py similarity index 100% rename from test/deprecated/legacy_test/test_cumprod_op.py rename to test/legacy_test/test_cumprod_op.py diff --git a/test/deprecated/legacy_test/test_deformable_conv_v1_op.py b/test/legacy_test/test_deformable_conv_v1_op.py similarity index 100% rename from test/deprecated/legacy_test/test_deformable_conv_v1_op.py rename to test/legacy_test/test_deformable_conv_v1_op.py diff --git a/test/deprecated/legacy_test/test_determinant_op.py b/test/legacy_test/test_determinant_op.py similarity index 100% rename from test/deprecated/legacy_test/test_determinant_op.py rename to test/legacy_test/test_determinant_op.py diff --git a/test/deprecated/legacy_test/test_diagonal_op.py b/test/legacy_test/test_diagonal_op.py similarity index 100% rename from test/deprecated/legacy_test/test_diagonal_op.py rename to test/legacy_test/test_diagonal_op.py diff --git a/test/legacy_test/test_dist_base.py b/test/legacy_test/test_dist_base.py index 0abf18fe42c87..143f7e1ee8e62 100755 --- a/test/legacy_test/test_dist_base.py +++ b/test/legacy_test/test_dist_base.py @@ -1040,7 +1040,7 @@ def __free_port(): ) as s: s.bind(('', 0)) print_to_err( - type(self).__name__, "socket name: %s" % s.getsockname()[1] + type(self).__name__, f"socket name: {s.getsockname()[1]}" ) return s.getsockname()[1] @@ -1479,10 +1479,9 @@ def _get_nccl2_trainer_cmd( def _run_cluster_gloo( self, model, envs, update_method, check_error_log, log_name ): - assert update_method == "gloo", ( - "_run_cluster_gloo must have update_method: gloo, but get %s" - % update_method - ) + assert ( + update_method == "gloo" + ), f"_run_cluster_gloo must have update_method: gloo, but get {update_method}" assert ( not self._use_hallreduce ), "_run_cluster_gloo must have _use_hallreduce = false" @@ -1551,9 +1550,7 @@ def _run_cluster_nccl2( if DIST_UT_PORT == 0: # NOTE(wangxi). 
hallreduce test must use 4cards after nccl>=2.7 for i in range(0, 4): - self._ps_endpoints += "127.0.0.1:%s," % ( - self._find_free_port() - ) + self._ps_endpoints += f"127.0.0.1:{self._find_free_port()}," else: for i in range(0, 4): self._ps_endpoints += "127.0.0.1:%s," % (DIST_UT_PORT + i) diff --git a/test/legacy_test/test_dist_hapi_model.py b/test/legacy_test/test_dist_hapi_model.py index 03a92d6f3cbc9..e41f5b344a594 100644 --- a/test/legacy_test/test_dist_hapi_model.py +++ b/test/legacy_test/test_dist_hapi_model.py @@ -70,9 +70,11 @@ def start_local_trainers( procs = [] for t in pod.trainers: proc_env = { - "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]), + "FLAGS_selected_gpus": "{}".format( + ",".join([str(g) for g in t.gpus]) + ), "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "FLAGS_dynamic_static_unified_comm": "0", diff --git a/test/deprecated/legacy_test/test_eigh_op.py b/test/legacy_test/test_eigh_op.py similarity index 100% rename from test/deprecated/legacy_test/test_eigh_op.py rename to test/legacy_test/test_eigh_op.py diff --git a/test/deprecated/legacy_test/test_elementwise_heaviside_op.py b/test/legacy_test/test_elementwise_heaviside_op.py similarity index 100% rename from test/deprecated/legacy_test/test_elementwise_heaviside_op.py rename to test/legacy_test/test_elementwise_heaviside_op.py diff --git a/test/deprecated/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py similarity index 100% rename from test/deprecated/legacy_test/test_elementwise_mul_op.py rename to test/legacy_test/test_elementwise_mul_op.py diff --git a/test/deprecated/legacy_test/test_elementwise_pow_op.py b/test/legacy_test/test_elementwise_pow_op.py similarity index 100% rename from test/deprecated/legacy_test/test_elementwise_pow_op.py rename to test/legacy_test/test_elementwise_pow_op.py diff --git a/test/deprecated/legacy_test/test_expand_v2_op.py b/test/legacy_test/test_expand_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_expand_v2_op.py rename to test/legacy_test/test_expand_v2_op.py diff --git a/test/deprecated/legacy_test/test_fill_any_op.py b/test/legacy_test/test_fill_any_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fill_any_op.py rename to test/legacy_test/test_fill_any_op.py diff --git a/test/deprecated/legacy_test/test_fill_diagonal_tensor_op.py b/test/legacy_test/test_fill_diagonal_tensor_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fill_diagonal_tensor_op.py rename to test/legacy_test/test_fill_diagonal_tensor_op.py diff --git a/test/deprecated/legacy_test/test_flatten_contiguous_range_op.py b/test/legacy_test/test_flatten_contiguous_range_op.py similarity index 100% rename from test/deprecated/legacy_test/test_flatten_contiguous_range_op.py rename to test/legacy_test/test_flatten_contiguous_range_op.py diff --git a/test/deprecated/legacy_test/test_flip.py b/test/legacy_test/test_flip.py similarity index 100% rename from test/deprecated/legacy_test/test_flip.py rename to test/legacy_test/test_flip.py diff --git a/test/deprecated/legacy_test/test_fmax_op.py b/test/legacy_test/test_fmax_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fmax_op.py rename to test/legacy_test/test_fmax_op.py diff --git 
a/test/deprecated/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fmin_op.py rename to test/legacy_test/test_fmin_op.py diff --git a/test/deprecated/legacy_test/test_fold_op.py b/test/legacy_test/test_fold_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fold_op.py rename to test/legacy_test/test_fold_op.py diff --git a/test/deprecated/legacy_test/test_fractional_max_pool2d_op.py b/test/legacy_test/test_fractional_max_pool2d_op.py similarity index 100% rename from test/deprecated/legacy_test/test_fractional_max_pool2d_op.py rename to test/legacy_test/test_fractional_max_pool2d_op.py diff --git a/test/deprecated/legacy_test/test_full_like_op.py b/test/legacy_test/test_full_like_op.py similarity index 100% rename from test/deprecated/legacy_test/test_full_like_op.py rename to test/legacy_test/test_full_like_op.py diff --git a/test/legacy_test/test_fused_groupnorm.py b/test/legacy_test/test_fused_groupnorm.py new file mode 100644 index 0000000000000..5dbaa4d5a569d --- /dev/null +++ b/test/legacy_test/test_fused_groupnorm.py @@ -0,0 +1,321 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
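+
+# A compact sketch of the group-norm math that the NumPy reference in this
+# file spells out per layout (hedged, channels-first inputs only): reshape
+# (N, C, *spatial) into (N * groups, -1), normalize each row with its own
+# mean and variance, then apply per-channel scale and bias. The fused op
+# under test additionally folds in a residual add and an optional silu.
+def _group_norm_sketch(x, scale, bias, groups, epsilon=1e-5):
+    import numpy as np
+
+    N, C = x.shape[:2]
+    g = x.reshape(N * groups, -1)
+    mean = g.mean(axis=1, keepdims=True)
+    var = g.var(axis=1, keepdims=True)
+    y = ((g - mean) / np.sqrt(var + epsilon)).reshape(x.shape)
+    affine = (1, C) + (1,) * (x.ndim - 2)  # broadcast over spatial dims
+    return y * scale.reshape(affine) + bias.reshape(affine)
+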
+import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle import base +from paddle.base import core +from paddle.base.layer_helper import LayerHelper + + +def naive_residual_add(x, residual): + return np.add(x, residual) + + +def naive_group_norm(x, scale, bias, epsilon, groups, data_layout): + dim = x.ndim + if dim == 3: + if data_layout == "NHWC": + x = np.transpose(x, (0, 2, 1)) # NLC => NCL + N, C, L = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + output = (x - mean) / np.sqrt(var + epsilon) + output = output.reshape((N, C, L)) * scale.reshape( + (-1, 1) + ) + bias.reshape((-1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 1)) # NCL => NLC + return [output, mean.reshape((N, G)), var.reshape((N, G))] + elif dim == 4: + if data_layout == "NHWC": + x = np.transpose(x, (0, 3, 1, 2)) # NHWC => NCHW + N, C, H, W = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + output = (x - mean) / np.sqrt(var + epsilon) + output = output.reshape((N, C, H, W)) * scale.reshape( + (-1, 1, 1) + ) + bias.reshape((-1, 1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 3, 1)) # NCHW => NHWC + return [output, mean.reshape((N, G)), var.reshape((N, G))] + else: + if data_layout == "NHWC": + x = np.transpose(x, (0, 4, 1, 2, 3)) # NDHWC => NCDHW + N, C, D, H, W = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + output = (x - mean) / np.sqrt(var + epsilon) + output = output.reshape((N, C, D, H, W)) * scale.reshape( + (-1, 1, 1, 1) + ) + bias.reshape((-1, 1, 1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 3, 4, 1)) # NCDHW => NDHWC + return [output, mean.reshape((N, G)), var.reshape((N, G))] + + +def naive_residual_biasadd_layer_norm( + x, residual, scale, bias, epsilon, groups, data_layout, activation +): + x = x + residual + out = naive_group_norm(x, scale, bias, epsilon, groups, data_layout) + if activation == "silu": + out[0] = F.silu(paddle.to_tensor(out[0])).numpy() + return out + + +def add_group_norm_silu_static_wrapper( + x, residual, scale, bias, epsilon, groups, data_layout="NHWC", activation="" +): + helper = LayerHelper('add_group_norm_silu', **locals()) + mean_out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + variance_out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + + inputs = {'x': x} + if bias is not None: + inputs['bias'] = bias + if scale is not None: + inputs['scale'] = scale + if residual is not None: + inputs['residual'] = residual + + # create output + group_norm_out = helper.create_variable_for_type_inference(dtype=x.dtype) + residual_out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="add_group_norm_silu", + inputs=inputs, + outputs={ + "y": group_norm_out, + "residual_out": residual_out, + "mean": mean_out, + "variance": variance_out, + }, + attrs={ + "epsilon": epsilon, + "groups": groups, + "data_format": data_layout, + "activation": activation, + }, + ) + + return group_norm_out, residual_out + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class 
TestGroupNormNHWC_StaticOp(unittest.TestCase): + def setUp(self): + np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (1, 1, 1, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 2 + self.data_layout = 'NHWC' + self.activation = '' + self.place = paddle.CUDAPlace(0) + + def check_residual_add_groupnorm( + self, x_np, scale_np, bias_np, residual_np, activation, dtype + ): + paddle.disable_static() + navie_groupnorm_out = naive_residual_biasadd_layer_norm( + x_np, + residual_np, + scale_np, + bias_np, + self.epsilon, + self.groups, + self.data_layout, + self.activation, + ) + navie_residual_out = naive_residual_add(x_np, residual_np) + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + x_static = paddle.static.data( + name="x_static", shape=self.shape, dtype=dtype + ) + residual_static = paddle.static.data( + name="residual_static", + shape=self.r_shape, + dtype=dtype, + ) + + scale_static = paddle.static.data( + name="scale_static", shape=[self.shape[-1]], dtype=dtype + ) + bias_static = paddle.static.data( + name="bias_static", shape=[self.shape[-1]], dtype=dtype + ) + outs = add_group_norm_silu_static_wrapper( + x_static, + residual_static, + scale_static, + bias_static, + self.epsilon, + self.groups, + self.data_layout, + activation, + ) + + exe = base.Executor(self.place) + out_s = exe.run( + feed={ + "x_static": x_np.astype(dtype), + "scale_static": scale_np.astype(dtype), + "residual_static": residual_np.astype(dtype), + "bias_static": bias_np.astype(dtype), + }, + fetch_list=[outs], + ) + return (out_s[0], out_s[1]), navie_groupnorm_out, navie_residual_out + + def test_residual_add_groupnorm_fp16(self): + if not paddle.is_compiled_with_cuda(): + return + self.dtype = np.float16 + ( + paddle_group_list, + paddle_naive_group_out, + paddle_naive_group_residual, + ) = self.check_residual_add_groupnorm( + self.x_np.astype(self.dtype), + self.scale_np.astype(self.dtype), + self.bias_np.astype(self.dtype), + self.residual_np.astype(self.dtype), + self.activation, + self.dtype, + ) + np.testing.assert_allclose( + paddle_group_list[1], + paddle_naive_group_residual, + rtol=1e-5, + atol=1e-5, + ) + np.testing.assert_allclose( + paddle_group_list[0], + paddle_naive_group_out[0], + rtol=1e-4, + atol=1e-4, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestGroupNormNHWCSilu_StaticOp(TestGroupNormNHWC_StaticOp): + def setUp(self): + np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (1, 1, 1, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 2 + self.data_layout = 'NHWC' + self.activation = 'silu' + self.place = paddle.CUDAPlace(0) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestGroupNormNHWC_StaticOp_1(TestGroupNormNHWC_StaticOp): + def setUp(self): + 
np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (2, 4, 2, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 2 + self.data_layout = 'NHWC' + self.activation = 'silu' + self.place = paddle.CUDAPlace(0) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestGroupNormNHWCSilu_StaticOp_1(TestGroupNormNHWC_StaticOp): + def setUp(self): + np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (2, 4, 2, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 2 + self.data_layout = 'NHWC' + self.activation = 'silu' + self.place = paddle.CUDAPlace(0) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestGroupNormNHWCSingleC_StaticOp(TestGroupNormNHWC_StaticOp): + def setUp(self): + np.random.seed(20) + self.shape = (2, 4, 2, 6) + self.r_shape = (2, 4, 2, 6) + self.x_np = np.random.uniform(-0.05, 0.05, self.shape) + self.residual_np = np.random.uniform(-0.05, 0.05, self.r_shape) + self.scale_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.bias_np = np.random.uniform(-0.05, 0.05, [self.shape[-1]]) + self.epsilon = 1e-5 + self.groups = 6 + self.data_layout = 'NHWC' + self.activation = '' + self.place = paddle.CUDAPlace(0) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/deprecated/legacy_test/test_gammaln_op.py b/test/legacy_test/test_gammaln_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gammaln_op.py rename to test/legacy_test/test_gammaln_op.py diff --git a/test/deprecated/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gaussian_random_op.py rename to test/legacy_test/test_gaussian_random_op.py diff --git a/test/deprecated/legacy_test/test_graph_send_recv_op.py b/test/legacy_test/test_graph_send_recv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_graph_send_recv_op.py rename to test/legacy_test/test_graph_send_recv_op.py diff --git a/test/deprecated/legacy_test/test_graph_send_ue_recv_op.py b/test/legacy_test/test_graph_send_ue_recv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_graph_send_ue_recv_op.py rename to test/legacy_test/test_graph_send_ue_recv_op.py diff --git a/test/deprecated/legacy_test/test_graph_send_uv_op.py b/test/legacy_test/test_graph_send_uv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_graph_send_uv_op.py rename to test/legacy_test/test_graph_send_uv_op.py diff --git a/test/deprecated/legacy_test/test_grid_sampler_op.py b/test/legacy_test/test_grid_sampler_op.py similarity index 100% rename from test/deprecated/legacy_test/test_grid_sampler_op.py rename to test/legacy_test/test_grid_sampler_op.py diff --git a/test/legacy_test/test_group_norm_op.py 
b/test/legacy_test/test_group_norm_op.py index f097df3b0b99c..7a6f57cc61ece 100644 --- a/test/legacy_test/test_group_norm_op.py +++ b/test/legacy_test/test_group_norm_op.py @@ -209,7 +209,7 @@ def do_compare_between_place(self): gpu_grads, inputs_to_check, 0.005, - "Gradient Check On %s" % str(place), + f"Gradient Check On {str(place)}", ) def test_check_grad(self): @@ -1748,7 +1748,7 @@ def test_jit_comp(self): fwd_actual[i], rtol=rtol, atol=atol, - err_msg='%s jit fwd' % self.places[i], + err_msg=f'{self.places[i]} jit fwd', ) # TODO: fix the diff between cpu and gpu grad is large in original op @@ -1762,7 +1762,7 @@ def test_jit_comp(self): rev_actual[i], rtol=rtol, atol=atol, - err_msg='%s jit rev' % self.places[i], + err_msg=f'{self.places[i]} jit rev', ) def test_jit_comp_with_cinn(self): @@ -1820,7 +1820,7 @@ def test_jit_comp_with_cinn(self): fwd_actual[i], rtol=rtol, # mean of uniform distribution, scale for avoid random failed atol=atol, - err_msg='%s jit_cinn fwd' % self.places[i], + err_msg=f'{self.places[i]} jit_cinn fwd', ) # TODO: fix the diff between cpu and gpu grad is large in original op # now use larger threshold when testing cpu grads to bypass cpu grad test @@ -1832,7 +1832,7 @@ def test_jit_comp_with_cinn(self): rev_actual[i], rtol=rtol, # mean of uniform distribution, scale for avoid random failed atol=atol, - err_msg='%s jit_cinn rev' % self.places[i], + err_msg=f'{self.places[i]} jit_cinn rev', ) i += 1 diff --git a/test/deprecated/legacy_test/test_gru_op.py b/test/legacy_test/test_gru_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gru_op.py rename to test/legacy_test/test_gru_op.py diff --git a/test/deprecated/legacy_test/test_gru_unit_op.py b/test/legacy_test/test_gru_unit_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gru_unit_op.py rename to test/legacy_test/test_gru_unit_op.py diff --git a/test/deprecated/legacy_test/test_gumbel_softmax_op.py b/test/legacy_test/test_gumbel_softmax_op.py similarity index 100% rename from test/deprecated/legacy_test/test_gumbel_softmax_op.py rename to test/legacy_test/test_gumbel_softmax_op.py diff --git a/test/deprecated/legacy_test/test_hinge_loss_op.py b/test/legacy_test/test_hinge_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_hinge_loss_op.py rename to test/legacy_test/test_hinge_loss_op.py diff --git a/test/deprecated/legacy_test/test_huber_loss_op.py b/test/legacy_test/test_huber_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_huber_loss_op.py rename to test/legacy_test/test_huber_loss_op.py diff --git a/test/deprecated/legacy_test/test_identity_loss_op.py b/test/legacy_test/test_identity_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_identity_loss_op.py rename to test/legacy_test/test_identity_loss_op.py diff --git a/test/deprecated/legacy_test/test_im2sequence_op.py b/test/legacy_test/test_im2sequence_op.py similarity index 100% rename from test/deprecated/legacy_test/test_im2sequence_op.py rename to test/legacy_test/test_im2sequence_op.py diff --git a/test/legacy_test/test_imperative_deepcf.py b/test/legacy_test/test_imperative_deepcf.py index 301ec4e0a468e..31e94078c7ca8 100644 --- a/test/legacy_test/test_imperative_deepcf.py +++ b/test/legacy_test/test_imperative_deepcf.py @@ -188,7 +188,7 @@ def get_data(self): ) def load_data(self): - sys.stderr.write('loading from %s\n' % self.data_path) + sys.stderr.write(f'loading from {self.data_path}\n') likes = {} num_users = -1 
num_items = -1 @@ -299,7 +299,7 @@ def test_deefcf(self): }, fetch_list=[loss], )[0] - sys.stderr.write('static loss %s\n' % static_loss) + sys.stderr.write(f'static loss {static_loss}\n') with base.dygraph.guard(): paddle.seed(seed) diff --git a/test/deprecated/legacy_test/test_imperative_framework.py b/test/legacy_test/test_imperative_framework.py similarity index 77% rename from test/deprecated/legacy_test/test_imperative_framework.py rename to test/legacy_test/test_imperative_framework.py index 01f6d37eed4b1..b85eeb11df517 100644 --- a/test/deprecated/legacy_test/test_imperative_framework.py +++ b/test/legacy_test/test_imperative_framework.py @@ -15,7 +15,6 @@ import unittest import numpy as np -from test_imperative_base import new_program_scope import paddle from paddle import base @@ -53,21 +52,13 @@ def forward(self, inputs): class TestDygraphFramework(unittest.TestCase): - def test_dygraph_backward(self): - with new_program_scope(): - mlp = MLP(input_size=2) - var_inp = paddle.static.data("input", shape=[2, 2], dtype="float32") - out = mlp(var_inp) - try: - out.backward() - raise AssertionError( - "backward should not be usable in static graph mode" - ) - except AssertionError as e: - self.assertTrue(e is not None) - def test_dygraph_to_string(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) with base.dygraph.guard(): var_inp = paddle.to_tensor(np_inp) print(str(var_inp)) + + +if __name__ == '__main__': + paddle.disable_static() + unittest.main() diff --git a/test/deprecated/legacy_test/test_imperative_star_gan_with_gradient_penalty.py b/test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py similarity index 100% rename from test/deprecated/legacy_test/test_imperative_star_gan_with_gradient_penalty.py rename to test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py diff --git a/test/deprecated/legacy_test/test_index_add_op.py b/test/legacy_test/test_index_add_op.py similarity index 100% rename from test/deprecated/legacy_test/test_index_add_op.py rename to test/legacy_test/test_index_add_op.py diff --git a/test/deprecated/legacy_test/test_index_sample_op.py b/test/legacy_test/test_index_sample_op.py similarity index 100% rename from test/deprecated/legacy_test/test_index_sample_op.py rename to test/legacy_test/test_index_sample_op.py diff --git a/test/deprecated/legacy_test/test_index_select_op.py b/test/legacy_test/test_index_select_op.py similarity index 100% rename from test/deprecated/legacy_test/test_index_select_op.py rename to test/legacy_test/test_index_select_op.py diff --git a/test/deprecated/legacy_test/test_input_spec.py b/test/legacy_test/test_input_spec.py similarity index 96% rename from test/deprecated/legacy_test/test_input_spec.py rename to test/legacy_test/test_input_spec.py index 8f86d002da306..aa649b58ca2a8 100644 --- a/test/deprecated/legacy_test/test_input_spec.py +++ b/test/legacy_test/test_input_spec.py @@ -35,9 +35,17 @@ def test_default(self): self.assertIsNone(tensor_spec.name) def test_from_tensor(self): - x_bool = paddle.tensor.fill_constant( - shape=[1], dtype='bool', value=True - ) + if paddle.framework.use_pir_api(): + x_bool = paddle.pir.core.create_parameter( + dtype='float32', + shape=[1], + name='xx', + initializer=paddle.nn.initializer.Uniform(), + ) + else: + x_bool = paddle.tensor.fill_constant( + shape=[1], dtype='bool', value=True + ) bool_spec = InputSpec.from_tensor(x_bool) self.assertEqual(bool_spec.dtype, x_bool.dtype) self.assertEqual(list(bool_spec.shape), list(x_bool.shape)) diff --git 
a/test/deprecated/legacy_test/test_instance_norm_op_v2.py b/test/legacy_test/test_instance_norm_op_v2.py similarity index 100% rename from test/deprecated/legacy_test/test_instance_norm_op_v2.py rename to test/legacy_test/test_instance_norm_op_v2.py diff --git a/test/deprecated/legacy_test/test_is_integer.py b/test/legacy_test/test_is_integer.py similarity index 100% rename from test/deprecated/legacy_test/test_is_integer.py rename to test/legacy_test/test_is_integer.py diff --git a/test/legacy_test/test_isin.py b/test/legacy_test/test_isin.py new file mode 100644 index 0000000000000..101d89b4de84f --- /dev/null +++ b/test/legacy_test/test_isin.py @@ -0,0 +1,327 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import convert_float_to_uint16 + +import paddle +from paddle import base +from paddle.base import core +from paddle.pir_utils import test_with_pir_api + +DATA_CASES = [ + {'x_data': np.array(1.0), 'test_x_data': np.array(-1.0)}, + { + 'x_data': np.random.randint(-10, 10, (4, 8)), + 'test_x_data': np.random.randint(0, 20, (2, 3)), + }, + { + 'x_data': np.random.randint(-50, 50, (8, 64)), + 'test_x_data': np.random.randint(-20, 0, (4, 256)), + }, +] + +DATA_CASES_UNIQUE = [ + { + 'x_data': np.arange(0, 1000).reshape([2, 5, 100]), + 'test_x_data': np.arange(200, 700), + }, + { + 'x_data': np.arange(-100, 100).reshape([2, 2, 5, 10]), + 'test_x_data': np.arange(50, 150).reshape([4, 5, 5]), + }, +] + +DATA_CASES_BF16 = [ + {'x_data': np.array(1.0), 'test_x_data': np.array(0.0)}, + { + 'x_data': np.random.randint(0, 10, (4, 8)), + 'test_x_data': np.random.randint(5, 15, (2, 3)), + }, + { + 'x_data': np.random.randint(0, 50, (8, 64)), + 'test_x_data': np.random.randint(0, 20, (4, 256)), + }, +] + + +DATA_CASES_UNIQUE_BF16 = [ + { + 'x_data': np.arange(0, 100).reshape([2, 5, 10]), + 'test_x_data': np.arange(50, 150), + }, +] + + +DATA_TYPE = ['float32', 'float64', 'int32', 'int64'] + + +def run_dygraph( + x_data, + test_x_data, + type, + assume_unique=False, + invert=False, + use_gpu=False, +): + place = paddle.CPUPlace() + if use_gpu and base.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + paddle.disable_static(place) + x_data = x_data.astype(type) + test_x_data = test_x_data.astype(type) + x_e = paddle.to_tensor(x_data) + x_t = paddle.to_tensor(test_x_data) + return paddle.isin(x_e, x_t, assume_unique, invert) + + +def run_static( + x_data, + test_x_data, + type, + assume_unique=False, + invert=False, + use_gpu=False, +): + paddle.enable_static() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + place = paddle.CPUPlace() + if use_gpu and base.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = base.Executor(place) + with paddle.static.program_guard(main_program, startup_program): + x_data = x_data.astype(type) + test_x_data = test_x_data.astype(type) + x_e = paddle.static.data(name='x_e', shape=x_data.shape, 
dtype=type) + x_t = paddle.static.data( + name='x_t', shape=test_x_data.shape, dtype=type + ) + res = paddle.isin(x_e, x_t, assume_unique, invert) + static_result = exe.run( + feed={'x_e': x_data, 'x_t': test_x_data}, + fetch_list=[res], + ) + return static_result + + +def test( + data_cases, type_cases, assume_unique=False, invert=False, use_gpu=False +): + for type in type_cases: + for case in data_cases: + x_data = case['x_data'] + test_x_data = case['test_x_data'] + dygraph_result = run_dygraph( + x_data, + test_x_data, + type, + assume_unique, + invert, + use_gpu, + ).numpy() + np_result = np.isin( + x_data.astype(type), + test_x_data.astype(type), + assume_unique=assume_unique, + invert=invert, + ) + np.testing.assert_equal(dygraph_result, np_result) + + @test_with_pir_api + def test_static(): + (static_result,) = run_static( + x_data, + test_x_data, + type, + assume_unique, + invert, + use_gpu, + ) + np.testing.assert_equal(static_result, np_result) + + test_static() + + +def run_dygraph_bf16( + x_data, + test_x_data, + assume_unique=False, + invert=False, + use_gpu=False, +): + place = paddle.CPUPlace() + if use_gpu and base.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + paddle.disable_static(place) + x_e = paddle.to_tensor(convert_float_to_uint16(x_data)) + x_t = paddle.to_tensor(convert_float_to_uint16(test_x_data)) + return paddle.isin(x_e, x_t, assume_unique, invert) + + +def run_static_bf16( + x_data, + test_x_data, + assume_unique=False, + invert=False, + use_gpu=False, +): + paddle.enable_static() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + place = paddle.CPUPlace() + if use_gpu and base.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = base.Executor(place) + with paddle.static.program_guard(main_program, startup_program): + x_data = convert_float_to_uint16(x_data) + test_x_data = convert_float_to_uint16(test_x_data) + x_e = paddle.static.data( + name='x_e', shape=x_data.shape, dtype=np.uint16 + ) + x_t = paddle.static.data( + name='x_t', shape=test_x_data.shape, dtype=np.uint16 + ) + res = paddle.isin(x_e, x_t, assume_unique, invert) + static_result = exe.run( + feed={'x_e': x_data, 'x_t': test_x_data}, + fetch_list=[res], + ) + return static_result + + +def test_bf16(data_cases, assume_unique=False, invert=False, use_gpu=False): + for case in data_cases: + x_data = case['x_data'].astype("float32") + test_x_data = case['test_x_data'].astype("float32") + dygraph_result = run_dygraph_bf16( + x_data, + test_x_data, + assume_unique, + invert, + use_gpu, + ).numpy() + np_result = np.isin( + x_data, + test_x_data, + assume_unique=assume_unique, + invert=invert, + ) + np.testing.assert_equal(dygraph_result, np_result) + + @test_with_pir_api + def test_static(): + (static_result,) = run_static_bf16( + x_data, + test_x_data, + assume_unique, + invert, + use_gpu, + ) + np.testing.assert_equal(static_result, np_result) + + test_static() + + +class TestIsInError(unittest.TestCase): + def test_for_exception(self): + with self.assertRaises(TypeError): + paddle.isin(np.array([1, 2]), np.array([1, 2])) + + +class TestIsIn(unittest.TestCase): + def test_without_gpu(self): + test(DATA_CASES, DATA_TYPE) + + def test_with_gpu(self): + test(DATA_CASES, DATA_TYPE, use_gpu=True) + + def test_invert_without_gpu(self): + test(DATA_CASES, DATA_TYPE, invert=True) + + def test_invert_with_gpu(self): + test(DATA_CASES, DATA_TYPE, invert=True, use_gpu=True) + + def test_unique_without_gpu(self): + test(DATA_CASES_UNIQUE, 
DATA_TYPE, assume_unique=True) + + def test_unique_with_gpu(self): + test(DATA_CASES_UNIQUE, DATA_TYPE, assume_unique=True, use_gpu=True) + + def test_unique_invert_without_gpu(self): + test(DATA_CASES_UNIQUE, DATA_TYPE, assume_unique=True, invert=True) + + def test_unique_invert_with_gpu(self): + test( + DATA_CASES_UNIQUE, + DATA_TYPE, + assume_unique=True, + invert=True, + use_gpu=True, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or does not support float16", +) +class TestIsInFP16(unittest.TestCase): + def test_default(self): + test(DATA_CASES, ['float16'], use_gpu=True) + + def test_invert(self): + test(DATA_CASES, ['float16'], invert=True, use_gpu=True) + + def test_unique(self): + test(DATA_CASES_UNIQUE, ['float16'], assume_unique=True, use_gpu=True) + + def test_unique_invert(self): + test( + DATA_CASES_UNIQUE, + ['float16'], + assume_unique=True, + invert=True, + use_gpu=True, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or does not support float16", +) +class TestIsInBF16(unittest.TestCase): + def test_default(self): + test_bf16(DATA_CASES_BF16, use_gpu=True) + + def test_invert(self): + test_bf16(DATA_CASES_BF16, invert=True, use_gpu=True) + + def test_unique(self): + test_bf16(DATA_CASES_UNIQUE_BF16, assume_unique=True, use_gpu=True) + + def test_unique_invert(self): + test_bf16( + DATA_CASES_UNIQUE_BF16, + assume_unique=True, + invert=True, + use_gpu=True, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_jit_save_load.py b/test/legacy_test/test_jit_save_load.py index 09f5a7b9a4e4b..04b86c6864685 100644 --- a/test/legacy_test/test_jit_save_load.py +++ b/test/legacy_test/test_jit_save_load.py @@ -329,7 +329,6 @@ def train(layer, input_size=784, label_size=1): for data in train_loader(): img, label = data label.stop_gradient = True - cost = layer(img) loss = paddle.nn.functional.cross_entropy( @@ -396,6 +395,8 @@ def train_and_save_model(self, model_path=None): @test_with_dygraph_pir def test_save_load(self): # train and save model + if not paddle.framework.use_pir_api(): + return train_layer = self.train_and_save_model() # load model loaded_layer = paddle.jit.load(self.model_path) @@ -496,6 +497,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() + @test_with_dygraph_pir def test_output_same_order(self): x = paddle.to_tensor(np.random.random((4, 8)).astype('float32')) @@ -1712,6 +1714,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() + @test_with_dygraph_pir def test_save_load_finetune_load(self): model_path = os.path.join( self.temp_dir.name, "test_jit_save_load_save_without_running/model" ) @@ -1788,7 +1791,6 @@ def forward(self, x): return y -''' class TestJitSaveLoadFinetuneLoad(unittest.TestCase): def setUp(self): # enable dygraph mode @@ -1798,8 +1800,10 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() - #@test_with_dygraph_pir + @test_with_dygraph_pir def test_save_load_finetune_load(self): + if not paddle.framework.use_pir_api(): + return model_path = os.path.join( self.temp_dir.name, "test_jit_save_load_finetune_load/model" ) @@ -1830,7 +1834,6 @@ def test_save_load_finetune_load(self): self.assertTrue(float((result_00 - result_10).abs().max()) < 1e-5) self.assertTrue(float((result_01 - result_11).abs().max()) < 1e-5) -''' # NOTE(weixin): When there are multiple test functions in
an diff --git a/test/deprecated/legacy_test/test_kldiv_loss_op.py b/test/legacy_test/test_kldiv_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_kldiv_loss_op.py rename to test/legacy_test/test_kldiv_loss_op.py diff --git a/test/deprecated/legacy_test/test_kron_op.py b/test/legacy_test/test_kron_op.py similarity index 100% rename from test/deprecated/legacy_test/test_kron_op.py rename to test/legacy_test/test_kron_op.py diff --git a/test/deprecated/legacy_test/test_kthvalue_op.py b/test/legacy_test/test_kthvalue_op.py similarity index 100% rename from test/deprecated/legacy_test/test_kthvalue_op.py rename to test/legacy_test/test_kthvalue_op.py diff --git a/test/deprecated/legacy_test/test_l1_norm_op.py b/test/legacy_test/test_l1_norm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_l1_norm_op.py rename to test/legacy_test/test_l1_norm_op.py diff --git a/test/deprecated/legacy_test/test_label_smooth_op.py b/test/legacy_test/test_label_smooth_op.py similarity index 100% rename from test/deprecated/legacy_test/test_label_smooth_op.py rename to test/legacy_test/test_label_smooth_op.py diff --git a/test/deprecated/legacy_test/test_lerp_op.py b/test/legacy_test/test_lerp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lerp_op.py rename to test/legacy_test/test_lerp_op.py diff --git a/test/deprecated/legacy_test/test_lgamma_op.py b/test/legacy_test/test_lgamma_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lgamma_op.py rename to test/legacy_test/test_lgamma_op.py diff --git a/test/deprecated/legacy_test/test_linear_interp_op.py b/test/legacy_test/test_linear_interp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_linear_interp_op.py rename to test/legacy_test/test_linear_interp_op.py diff --git a/test/deprecated/legacy_test/test_linear_interp_v2_op.py b/test/legacy_test/test_linear_interp_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_linear_interp_v2_op.py rename to test/legacy_test/test_linear_interp_v2_op.py diff --git a/test/deprecated/legacy_test/test_load_state_dict_from_old_format.py b/test/legacy_test/test_load_state_dict_from_old_format.py similarity index 100% rename from test/deprecated/legacy_test/test_load_state_dict_from_old_format.py rename to test/legacy_test/test_load_state_dict_from_old_format.py diff --git a/test/deprecated/legacy_test/test_log_loss_op.py b/test/legacy_test/test_log_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_log_loss_op.py rename to test/legacy_test/test_log_loss_op.py diff --git a/test/deprecated/legacy_test/test_log_softmax.py b/test/legacy_test/test_log_softmax.py similarity index 100% rename from test/deprecated/legacy_test/test_log_softmax.py rename to test/legacy_test/test_log_softmax.py diff --git a/test/deprecated/legacy_test/test_logsumexp.py b/test/legacy_test/test_logsumexp.py similarity index 100% rename from test/deprecated/legacy_test/test_logsumexp.py rename to test/legacy_test/test_logsumexp.py diff --git a/test/deprecated/legacy_test/test_lr_scheduler.py b/test/legacy_test/test_lr_scheduler.py similarity index 100% rename from test/deprecated/legacy_test/test_lr_scheduler.py rename to test/legacy_test/test_lr_scheduler.py diff --git a/test/deprecated/legacy_test/test_lrn_op.py b/test/legacy_test/test_lrn_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lrn_op.py rename to test/legacy_test/test_lrn_op.py diff --git 
a/test/legacy_test/test_lstm_cudnn_op.py b/test/legacy_test/test_lstm_cudnn_op.py index ade1f61c0d5a9..3362297747b63 100644 --- a/test/legacy_test/test_lstm_cudnn_op.py +++ b/test/legacy_test/test_lstm_cudnn_op.py @@ -35,7 +35,7 @@ class RandomWeight: def __init__(self): pass - def updata_weight(self, hidden_size, input_size, dtype): + def update_weight(self, hidden_size, input_size, dtype): std = 1.0 / math.sqrt(hidden_size) self.hidden_size = hidden_size self.input_size = input_size @@ -432,7 +432,7 @@ def setUp(self): input[9][3:][:] = 0 input[8][4:][:] = 0 - weight.updata_weight(hidden_size, input_size, self.dtype) + weight.update_weight(hidden_size, input_size, self.dtype) rnn1 = LSTM( input_size, hidden_size, diff --git a/test/deprecated/legacy_test/test_lstm_op.py b/test/legacy_test/test_lstm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lstm_op.py rename to test/legacy_test/test_lstm_op.py diff --git a/test/deprecated/legacy_test/test_lu_unpack_op.py b/test/legacy_test/test_lu_unpack_op.py similarity index 100% rename from test/deprecated/legacy_test/test_lu_unpack_op.py rename to test/legacy_test/test_lu_unpack_op.py diff --git a/test/deprecated/legacy_test/test_masked_scatter.py b/test/legacy_test/test_masked_scatter.py similarity index 100% rename from test/deprecated/legacy_test/test_masked_scatter.py rename to test/legacy_test/test_masked_scatter.py diff --git a/test/deprecated/legacy_test/test_matmul_op.py b/test/legacy_test/test_matmul_op.py similarity index 100% rename from test/deprecated/legacy_test/test_matmul_op.py rename to test/legacy_test/test_matmul_op.py diff --git a/test/deprecated/legacy_test/test_matmul_v2_op.py b/test/legacy_test/test_matmul_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_matmul_v2_op.py rename to test/legacy_test/test_matmul_v2_op.py diff --git a/test/deprecated/legacy_test/test_maxout_op.py b/test/legacy_test/test_maxout_op.py similarity index 100% rename from test/deprecated/legacy_test/test_maxout_op.py rename to test/legacy_test/test_maxout_op.py diff --git a/test/deprecated/legacy_test/test_meshgrid_op.py b/test/legacy_test/test_meshgrid_op.py similarity index 80% rename from test/deprecated/legacy_test/test_meshgrid_op.py rename to test/legacy_test/test_meshgrid_op.py index b72f51cd04144..869e2c4e88281 100644 --- a/test/deprecated/legacy_test/test_meshgrid_op.py +++ b/test/legacy_test/test_meshgrid_op.py @@ -42,16 +42,28 @@ def init_data_type(self): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_prim=True, check_pir=True, check_prim_pir=True) + if self.dtype == np.complex64 or self.dtype == np.complex128: + self.check_output(check_pir=True) + else: + self.check_output( + check_prim=True, check_pir=True, check_prim_pir=True + ) def test_check_grad(self): - self.check_grad( - ['x0'], - ['out0', 'out1'], - check_prim=True, - check_pir=True, - check_prim_pir=True, - ) + if self.dtype == np.complex64 or self.dtype == np.complex128: + self.check_grad( + ['x0'], + ['out0', 'out1'], + check_pir=True, + ) + else: + self.check_grad( + ['x0'], + ['out0', 'out1'], + check_prim=True, + check_pir=True, + check_prim_pir=True, + ) def init_inputs_and_outputs(self): self.shape = self.get_x_shape() @@ -91,6 +103,22 @@ def init_data_type(self): self.dtype = np.float16 +class TestMeshgridOp2Complex64(TestMeshgridOp): + def get_x_shape(self): + return [100, 300] + + def init_data_type(self): + self.dtype = np.complex64 + + +class TestMeshgridOp2Complex128(TestMeshgridOp): 
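+    # Editor's note (clarifying comment, not part of the original patch):
+    # complex128 twin of TestMeshgridOp2Complex64 above; both complex dtypes
+    # take the non-prim check_output/check_grad branch added in this hunk,
+    # presumably because prim decomposition does not cover complex types.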
+ def get_x_shape(self): + return [100, 300] + + def init_data_type(self): + self.dtype = np.complex128 + + @unittest.skipIf( not core.is_compiled_with_cuda() or not core.is_bfloat16_supported(core.CUDAPlace(0)), @@ -336,6 +364,70 @@ def test_api_with_dygraph_tuple_input(self): np.testing.assert_array_equal(res_4.shape, [100, 200]) +class TestMeshgridOpComplexStatic(unittest.TestCase): + @test_with_pir_api + def test_tuple_input(self): + input_1 = np.random.randint( + 0, + 100, + [ + 100, + ], + ).astype('complex64') + input_2 = np.random.randint( + 0, + 100, + [ + 200, + ], + ).astype('complex64') + + out_1 = np.reshape(input_1, [100, 1]) + out_1 = np.broadcast_to(out_1, [100, 200]) + out_2 = np.reshape(input_2, [1, 200]) + out_2 = np.broadcast_to(out_2, [100, 200]) + + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(shape=[100], dtype='complex64', name='x') + y = paddle.static.data(shape=[200], dtype='complex64', name='y') + + exe = base.Executor(place=base.CPUPlace()) + grid_x, grid_y = paddle.tensor.meshgrid((x, y)) + res_1, res_2 = exe.run( + paddle.static.default_main_program(), + feed={'x': input_1, 'y': input_2}, + fetch_list=[grid_x, grid_y], + ) + np.testing.assert_array_equal(res_1, out_1) + np.testing.assert_array_equal(res_2, out_2) + + +class TestMeshgridOpComplexDygraph(unittest.TestCase): + def test_api_with_dygraph_tuple_input(self): + input_3 = np.random.randint( + 0, + 100, + [ + 100, + ], + ).astype('complex64') + input_4 = np.random.randint( + 0, + 100, + [ + 200, + ], + ).astype('complex64') + + with base.dygraph.guard(): + tensor_3 = paddle.to_tensor(input_3) + tensor_4 = paddle.to_tensor(input_4) + res_3, res_4 = paddle.tensor.meshgrid((tensor_3, tensor_4)) + + np.testing.assert_array_equal(res_3.shape, [100, 200]) + np.testing.assert_array_equal(res_4.shape, [100, 200]) + + class TestMeshGrid_ZeroDim(TestMeshgridOp): def init_inputs_and_outputs(self): self.shape = self.get_x_shape() diff --git a/test/deprecated/legacy_test/test_modified_huber_loss_op.py b/test/legacy_test/test_modified_huber_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_modified_huber_loss_op.py rename to test/legacy_test/test_modified_huber_loss_op.py diff --git a/test/deprecated/legacy_test/test_mul_op.py b/test/legacy_test/test_mul_op.py similarity index 100% rename from test/deprecated/legacy_test/test_mul_op.py rename to test/legacy_test/test_mul_op.py diff --git a/test/deprecated/legacy_test/test_multi_dot_op.py b/test/legacy_test/test_multi_dot_op.py similarity index 100% rename from test/deprecated/legacy_test/test_multi_dot_op.py rename to test/legacy_test/test_multi_dot_op.py diff --git a/test/deprecated/legacy_test/test_mv_op.py b/test/legacy_test/test_mv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_mv_op.py rename to test/legacy_test/test_mv_op.py diff --git a/test/deprecated/legacy_test/test_nearest_interp_op.py b/test/legacy_test/test_nearest_interp_op.py similarity index 100% rename from test/deprecated/legacy_test/test_nearest_interp_op.py rename to test/legacy_test/test_nearest_interp_op.py diff --git a/test/deprecated/legacy_test/test_nearest_interp_v2_op.py b/test/legacy_test/test_nearest_interp_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_nearest_interp_v2_op.py rename to test/legacy_test/test_nearest_interp_v2_op.py diff --git a/test/deprecated/legacy_test/test_ops_nms.py b/test/legacy_test/test_ops_nms.py similarity index 100% rename from 
test/deprecated/legacy_test/test_ops_nms.py rename to test/legacy_test/test_ops_nms.py diff --git a/test/deprecated/legacy_test/test_overlap_add_op.py b/test/legacy_test/test_overlap_add_op.py similarity index 100% rename from test/deprecated/legacy_test/test_overlap_add_op.py rename to test/legacy_test/test_overlap_add_op.py diff --git a/test/deprecated/legacy_test/test_pad3d_op.py b/test/legacy_test/test_pad3d_op.py similarity index 100% rename from test/deprecated/legacy_test/test_pad3d_op.py rename to test/legacy_test/test_pad3d_op.py diff --git a/test/deprecated/legacy_test/test_paddle_save_load_binary.py b/test/legacy_test/test_paddle_save_load_binary.py similarity index 100% rename from test/deprecated/legacy_test/test_paddle_save_load_binary.py rename to test/legacy_test/test_paddle_save_load_binary.py diff --git a/test/legacy_test/test_parallel_dygraph_dataparallel.py b/test/legacy_test/test_parallel_dygraph_dataparallel.py index 648f6ddd97ef2..166687ce098e4 100644 --- a/test/legacy_test/test_parallel_dygraph_dataparallel.py +++ b/test/legacy_test/test_parallel_dygraph_dataparallel.py @@ -66,7 +66,7 @@ def start_local_trainers_cpu( proc_env = { "PADDLE_DISTRI_BACKEND": "gloo", "PADDLE_TRAINER_ID": "%d" % rank_id, - "PADDLE_CURRENT_ENDPOINT": "%s" % endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{endpoint}", "PADDLE_TRAINERS_NUM": "%d" % n_rank, "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), } @@ -118,10 +118,11 @@ def start_local_trainers( procs = [] for t in pod.trainers: proc_env = { - f"FLAGS_selected_{accelerator_type}s": "%s" - % ",".join([str(g) for g in t.gpus]), + f"FLAGS_selected_{accelerator_type}s": "{}".format( + ",".join([str(g) for g in t.gpus]) + ), "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "FLAGS_dynamic_static_unified_comm": "0", diff --git a/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py b/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py index 5a944284414bf..cd1b89e064d6e 100644 --- a/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py +++ b/test/legacy_test/test_parallel_dygraph_dataparallel_cpuonly.py @@ -66,7 +66,7 @@ def start_local_trainers( for t in pod.trainers: proc_env = { "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "MASTER_ADDR": "127.0.0.1", diff --git a/test/deprecated/legacy_test/test_partial_concat_op.py b/test/legacy_test/test_partial_concat_op.py similarity index 100% rename from test/deprecated/legacy_test/test_partial_concat_op.py rename to test/legacy_test/test_partial_concat_op.py diff --git a/test/deprecated/legacy_test/test_partial_sum_op.py b/test/legacy_test/test_partial_sum_op.py similarity index 100% rename from test/deprecated/legacy_test/test_partial_sum_op.py rename to test/legacy_test/test_partial_sum_op.py diff --git a/test/deprecated/legacy_test/test_pixel_shuffle_op.py b/test/legacy_test/test_pixel_shuffle_op.py similarity index 100% rename from test/deprecated/legacy_test/test_pixel_shuffle_op.py rename to test/legacy_test/test_pixel_shuffle_op.py diff --git a/test/deprecated/legacy_test/test_pool3d_op.py b/test/legacy_test/test_pool3d_op.py similarity index 100% rename 
from test/deprecated/legacy_test/test_pool3d_op.py rename to test/legacy_test/test_pool3d_op.py diff --git a/test/deprecated/legacy_test/test_put_along_axis_op.py b/test/legacy_test/test_put_along_axis_op.py similarity index 100% rename from test/deprecated/legacy_test/test_put_along_axis_op.py rename to test/legacy_test/test_put_along_axis_op.py diff --git a/test/deprecated/legacy_test/test_qr_op.py b/test/legacy_test/test_qr_op.py similarity index 100% rename from test/deprecated/legacy_test/test_qr_op.py rename to test/legacy_test/test_qr_op.py diff --git a/test/deprecated/legacy_test/test_repeat_interleave_op.py b/test/legacy_test/test_repeat_interleave_op.py similarity index 100% rename from test/deprecated/legacy_test/test_repeat_interleave_op.py rename to test/legacy_test/test_repeat_interleave_op.py diff --git a/test/deprecated/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py similarity index 100% rename from test/deprecated/legacy_test/test_reshape_op.py rename to test/legacy_test/test_reshape_op.py diff --git a/test/deprecated/legacy_test/test_reverse_op.py b/test/legacy_test/test_reverse_op.py similarity index 100% rename from test/deprecated/legacy_test/test_reverse_op.py rename to test/legacy_test/test_reverse_op.py diff --git a/test/deprecated/legacy_test/test_roi_align_op.py b/test/legacy_test/test_roi_align_op.py similarity index 100% rename from test/deprecated/legacy_test/test_roi_align_op.py rename to test/legacy_test/test_roi_align_op.py diff --git a/test/deprecated/legacy_test/test_roi_pool_op.py b/test/legacy_test/test_roi_pool_op.py similarity index 100% rename from test/deprecated/legacy_test/test_roi_pool_op.py rename to test/legacy_test/test_roi_pool_op.py diff --git a/test/deprecated/legacy_test/test_roll_op.py b/test/legacy_test/test_roll_op.py similarity index 100% rename from test/deprecated/legacy_test/test_roll_op.py rename to test/legacy_test/test_roll_op.py diff --git a/test/deprecated/legacy_test/test_row_conv_op.py b/test/legacy_test/test_row_conv_op.py similarity index 100% rename from test/deprecated/legacy_test/test_row_conv_op.py rename to test/legacy_test/test_row_conv_op.py diff --git a/test/deprecated/legacy_test/test_save_inference_model_conditional_op.py b/test/legacy_test/test_save_inference_model_conditional_op.py similarity index 100% rename from test/deprecated/legacy_test/test_save_inference_model_conditional_op.py rename to test/legacy_test/test_save_inference_model_conditional_op.py diff --git a/test/deprecated/legacy_test/test_save_model_without_var.py b/test/legacy_test/test_save_model_without_var.py similarity index 100% rename from test/deprecated/legacy_test/test_save_model_without_var.py rename to test/legacy_test/test_save_model_without_var.py diff --git a/test/deprecated/legacy_test/test_scatter_op.py b/test/legacy_test/test_scatter_op.py similarity index 100% rename from test/deprecated/legacy_test/test_scatter_op.py rename to test/legacy_test/test_scatter_op.py diff --git a/test/deprecated/legacy_test/test_selu_op.py b/test/legacy_test/test_selu_op.py similarity index 100% rename from test/deprecated/legacy_test/test_selu_op.py rename to test/legacy_test/test_selu_op.py diff --git a/test/deprecated/legacy_test/test_shuffle_channel_op.py b/test/legacy_test/test_shuffle_channel_op.py similarity index 100% rename from test/deprecated/legacy_test/test_shuffle_channel_op.py rename to test/legacy_test/test_shuffle_channel_op.py diff --git 
a/test/deprecated/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py b/test/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py similarity index 100% rename from test/deprecated/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py rename to test/legacy_test/test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad.py diff --git a/test/deprecated/legacy_test/test_sign_op.py b/test/legacy_test/test_sign_op.py similarity index 100% rename from test/deprecated/legacy_test/test_sign_op.py rename to test/legacy_test/test_sign_op.py diff --git a/test/deprecated/legacy_test/test_solve_op.py b/test/legacy_test/test_solve_op.py similarity index 100% rename from test/deprecated/legacy_test/test_solve_op.py rename to test/legacy_test/test_solve_op.py diff --git a/test/deprecated/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py similarity index 100% rename from test/deprecated/legacy_test/test_spectral_norm_op.py rename to test/legacy_test/test_spectral_norm_op.py diff --git a/test/deprecated/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py similarity index 100% rename from test/deprecated/legacy_test/test_split_op.py rename to test/legacy_test/test_split_op.py diff --git a/test/deprecated/legacy_test/test_static_save_load_large.py b/test/legacy_test/test_static_save_load_large.py similarity index 100% rename from test/deprecated/legacy_test/test_static_save_load_large.py rename to test/legacy_test/test_static_save_load_large.py diff --git a/test/deprecated/legacy_test/test_stft_op.py b/test/legacy_test/test_stft_op.py similarity index 100% rename from test/deprecated/legacy_test/test_stft_op.py rename to test/legacy_test/test_stft_op.py diff --git a/test/deprecated/legacy_test/test_svd_op.py b/test/legacy_test/test_svd_op.py similarity index 100% rename from test/deprecated/legacy_test/test_svd_op.py rename to test/legacy_test/test_svd_op.py diff --git a/test/deprecated/legacy_test/test_swiglu.py b/test/legacy_test/test_swiglu.py similarity index 100% rename from test/deprecated/legacy_test/test_swiglu.py rename to test/legacy_test/test_swiglu.py diff --git a/test/deprecated/legacy_test/test_temporal_shift_op.py b/test/legacy_test/test_temporal_shift_op.py similarity index 100% rename from test/deprecated/legacy_test/test_temporal_shift_op.py rename to test/legacy_test/test_temporal_shift_op.py diff --git a/test/deprecated/legacy_test/test_top_k_op.py b/test/legacy_test/test_top_k_op.py similarity index 100% rename from test/deprecated/legacy_test/test_top_k_op.py rename to test/legacy_test/test_top_k_op.py diff --git a/test/deprecated/legacy_test/test_top_k_v2_op.py b/test/legacy_test/test_top_k_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_top_k_v2_op.py rename to test/legacy_test/test_top_k_v2_op.py diff --git a/test/deprecated/legacy_test/test_trace_op.py b/test/legacy_test/test_trace_op.py similarity index 100% rename from test/deprecated/legacy_test/test_trace_op.py rename to test/legacy_test/test_trace_op.py diff --git a/test/deprecated/legacy_test/test_triangular_solve_op.py b/test/legacy_test/test_triangular_solve_op.py similarity index 100% rename from test/deprecated/legacy_test/test_triangular_solve_op.py rename to test/legacy_test/test_triangular_solve_op.py diff --git a/test/deprecated/legacy_test/test_trilinear_interp_op.py b/test/legacy_test/test_trilinear_interp_op.py similarity index 100% rename from 
test/deprecated/legacy_test/test_trilinear_interp_op.py rename to test/legacy_test/test_trilinear_interp_op.py diff --git a/test/deprecated/legacy_test/test_trilinear_interp_v2_op.py b/test/legacy_test/test_trilinear_interp_v2_op.py similarity index 100% rename from test/deprecated/legacy_test/test_trilinear_interp_v2_op.py rename to test/legacy_test/test_trilinear_interp_v2_op.py diff --git a/test/deprecated/legacy_test/test_trunc_op.py b/test/legacy_test/test_trunc_op.py similarity index 100% rename from test/deprecated/legacy_test/test_trunc_op.py rename to test/legacy_test/test_trunc_op.py diff --git a/test/deprecated/legacy_test/test_unfold_op.py b/test/legacy_test/test_unfold_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unfold_op.py rename to test/legacy_test/test_unfold_op.py diff --git a/test/deprecated/legacy_test/test_unique_consecutive_op.py b/test/legacy_test/test_unique_consecutive_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unique_consecutive_op.py rename to test/legacy_test/test_unique_consecutive_op.py diff --git a/test/deprecated/legacy_test/test_unpool3d_op.py b/test/legacy_test/test_unpool3d_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unpool3d_op.py rename to test/legacy_test/test_unpool3d_op.py diff --git a/test/deprecated/legacy_test/test_unpool_op.py b/test/legacy_test/test_unpool_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unpool_op.py rename to test/legacy_test/test_unpool_op.py diff --git a/test/deprecated/legacy_test/test_unstack_op.py b/test/legacy_test/test_unstack_op.py similarity index 100% rename from test/deprecated/legacy_test/test_unstack_op.py rename to test/legacy_test/test_unstack_op.py diff --git a/test/deprecated/legacy_test/test_yolov3_loss_op.py b/test/legacy_test/test_yolov3_loss_op.py similarity index 100% rename from test/deprecated/legacy_test/test_yolov3_loss_op.py rename to test/legacy_test/test_yolov3_loss_op.py diff --git a/test/prim/pir_prim/CMakeLists.txt b/test/prim/pir_prim/CMakeLists.txt index 108cc3b8b28da..300cd1948f6b8 100644 --- a/test/prim/pir_prim/CMakeLists.txt +++ b/test/prim/pir_prim/CMakeLists.txt @@ -12,7 +12,8 @@ set(TEST_PRIM_PURE_PIR_CASES test_auto_recompute test_auto_recompute_dy2static test_prim_sub_graph_dynamic_shape - test_decompose_control_flow) + test_decompose_control_flow + test_decomp_whole_program) foreach(target ${TEST_PRIM_PURE_PIR_CASES}) py_test_modules( @@ -52,6 +53,7 @@ if(WITH_CINN) FLAGS_prim_check_ops=true FLAGS_enable_pir_api=true FLAGS_prim_enable_dynamic=true + FLAGS_prim_vjp_skip_default_ops=false FLAGS_cinn_bucket_compile=True FLAGS_pir_apply_shape_optimization_pass=1) set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=CINN") diff --git a/test/prim/pir_prim/test_decomp_whole_program.py b/test/prim/pir_prim/test_decomp_whole_program.py index f8c58ef7c2469..7d0b28edf5dad 100644 --- a/test/prim/pir_prim/test_decomp_whole_program.py +++ b/test/prim/pir_prim/test_decomp_whole_program.py @@ -40,7 +40,8 @@ def base_net(self, flag=None): y.stop_gradient = False x1 = paddle.sin(x) y1 = paddle.cos(y) - tmp1 = paddle.matmul(x1, y1) + y3 = paddle.matmul(x1, y1) + tmp1 = paddle.concat((x1, y1, y3)) tmp2 = paddle.mean(tmp1) sum_out = paddle.sin(tmp2) gradients = grad(sum_out, (x, y)) @@ -54,17 +55,18 @@ def base_net(self, flag=None): whole_ops = [op.name() for op in main_program.global_block().ops] if flag == "prim": - assert 'pd_op.matmul_grad' not in whole_ops + assert 'pd_op.concat_grad' 
not in whole_ops else: - assert 'pd_op.matmul_grad' in whole_ops + assert 'pd_op.concat_grad' in whole_ops return fwd, dx, dy def test_prim_all(self): + paddle.base.core._set_prim_backward_blacklist("sin_grad", "cos_grad") res_ref = self.base_net() res = self.base_net("prim") for ref, actual in zip(res_ref, res): - np.testing.assert_allclose(ref, actual, rtol=1e-6) + np.testing.assert_allclose(ref, actual, rtol=1e-6, atol=1e-6) if __name__ == "__main__": diff --git a/test/quantization/test_imperative_qat_lsq.py b/test/quantization/test_imperative_qat_lsq.py index c71bd02c56bbc..bd16d309b249c 100644 --- a/test/quantization/test_imperative_qat_lsq.py +++ b/test/quantization/test_imperative_qat_lsq.py @@ -213,7 +213,7 @@ def func_qat(self): print('eval_acc_top1', eval_acc_top1) self.assertTrue( eval_acc_top1 > 0.9, - msg="The test acc {%f} is less than 0.9." % eval_acc_top1, + msg=f"The test acc {{{eval_acc_top1:f}}} is less than 0.9.", ) def test_qat(self): diff --git a/test/sot/test_sot_dynamic_shape.py b/test/sot/test_sot_dynamic_shape.py index ceed37d64438a..12608d1c871e4 100644 --- a/test/sot/test_sot_dynamic_shape.py +++ b/test/sot/test_sot_dynamic_shape.py @@ -25,7 +25,7 @@ from paddle.jit.sot.utils import with_allow_dynamic_shape_guard -def foo(x): +def dynamic_shape_input_func1(x): s = x.shape[0] return x + s @@ -85,6 +85,20 @@ def test_dynamic_int_input_cache_hit_case3(self): ) self.assertEqual(ctx.translate_count, i + 1) + def test_dynamic_shape_input_cache_hit_case1(self): + with with_allow_dynamic_shape_guard( + True + ), test_instruction_translator_cache_context() as ctx: + self.assert_results( + dynamic_shape_input_func1, paddle.randn([1, 4, 5]) + ) + self.assertEqual(ctx.translate_count, 1) + for i in range(2, 6): + self.assert_results( + dynamic_shape_input_func1, paddle.randn([i, 4, 5]) + ) + self.assertEqual(ctx.translate_count, 2) + if __name__ == '__main__': unittest.main() diff --git a/test/standalone_executor/test_standalone_measure_real_op_cost.py b/test/standalone_executor/test_standalone_measure_real_op_cost.py index 9825e16e91ee6..8ee254a427d8e 100644 --- a/test/standalone_executor/test_standalone_measure_real_op_cost.py +++ b/test/standalone_executor/test_standalone_measure_real_op_cost.py @@ -112,7 +112,7 @@ def _run_op_profiling(self, place, run_profiling=True): return loss_data def _compare_loss_between(self, loss_run1, loss_run2): - s1, s2 = '%.6f' % loss_run1, '%.6f' % loss_run2 + s1, s2 = f'{loss_run1:.6f}', f'{loss_run2:.6f}' return s1 == s2 def test_op_profiling_cuda0(self): diff --git a/test/xpu/test_block_multihead_attention_op_xpu.py b/test/xpu/test_block_multihead_attention_op_xpu.py new file mode 100644 index 0000000000000..07c624c86b209 --- /dev/null +++ b/test/xpu/test_block_multihead_attention_op_xpu.py @@ -0,0 +1,585 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
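+
+# ---------------------------------------------------------------------------
+# Editor's note (illustrative sketch, not part of the original patch): this
+# new test cross-checks the fused block_multihead_attention_xpu kernel
+# against a naive reference implementation. The causal mask built by
+# create_attn_mask() below relies on (tril(ones) - 1) * 1e4 being 0 on
+# visible positions and -1e4 on future positions, e.g.:
+#
+#     import paddle
+#     m = (paddle.tril(paddle.ones([3, 3])) - 1) * 1e4
+#     # m -> [[0, -1e4, -1e4],
+#     #       [0,    0, -1e4],
+#     #       [0,    0,    0]]
+#
+# Adding m to the attention logits before softmax drives the masked
+# probabilities to ~0, which is the behaviour the fused kernel reproduces.
+# ---------------------------------------------------------------------------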
+ +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional.block_multihead_attention import ( + block_multihead_attention_xpu, +) + +paddle.seed(2023) +np.random.seed(2023) + + +def create_attn_mask( + mask_type, + batch_size, + seq_lens, + pre_cache_length=0, +): + max_seq_len = max(seq_lens) + mask = paddle.zeros( + [batch_size, 1, max_seq_len, max_seq_len + pre_cache_length], + dtype=mask_type, + ) + mask[:, :, :, :pre_cache_length] = 1 + for i in range(batch_size): + seq_len = seq_lens[i] + mask[i, 0, :seq_len, :seq_len] = ( + paddle.tril(paddle.ones(shape=(seq_len, seq_len), dtype=mask_type)) + - 1 + ) * 1e4 + return mask + + +def naive_attention_impl( + query, + key, + value, + cache_k=None, + cache_v=None, + pre_cache_k=None, + pre_cache_v=None, + mask=None, + scale=1.0, + cache_k_dequant_scales=None, + cache_v_dequant_scales=None, + use_cachekv_int8="None", +): + batch = query.shape[0] + heads = query.shape[1] + seq_len = query.shape[2] + head_dim = query.shape[3] + kv_head = key.shape[1] + + key = key.reshape([batch, kv_head, 1, seq_len, head_dim]) + key = paddle.tile(key, [1, 1, heads // kv_head, 1, 1]) + key = key.reshape([batch, heads, seq_len, head_dim]) + + if use_cachekv_int8 == "dynamic": + unsqueeze_shape = [2, 3] + elif use_cachekv_int8 == "static": + unsqueeze_shape = [0, 2, 3] + if pre_cache_k is not None: + key = paddle.concat([pre_cache_k, key], axis=2) + if cache_k is not None: + if cache_k_dequant_scales is not None: + dequant_cache_k = ( + (cache_k.astype('float32') - 128.0) + * cache_k_dequant_scales.unsqueeze(unsqueeze_shape) + ).astype(key.dtype) + key = paddle.concat([dequant_cache_k, key], axis=2) + else: + key = paddle.concat([cache_k, key], axis=2) + + value = value.reshape([batch, kv_head, 1, seq_len, head_dim]) + value = paddle.tile(value, [1, 1, heads // kv_head, 1, 1]) + value = value.reshape([batch, heads, seq_len, head_dim]) + if pre_cache_v is not None: + value = paddle.concat([pre_cache_v, value], axis=2) + if cache_v is not None: + if cache_v_dequant_scales is not None: + dequant_cache_v = ( + (cache_v.astype('float32') - 128.0) + * cache_v_dequant_scales.unsqueeze(unsqueeze_shape) + ).astype(value.dtype) + value = paddle.concat([dequant_cache_v, value], axis=2) + else: + value = paddle.concat([cache_v, value], axis=2) + + qk_res = paddle.matmul(query, key, transpose_y=True) + attention = qk_res * scale + if mask is not None: + attention = attention + mask + softmax_result = paddle.nn.functional.softmax(attention, -1) + result = paddle.matmul(softmax_result, value) + return result + + +def get_padding_offset(bsz, max_seq_len, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(max_seq_len - seq_lens_this_time) + cum_offsets = paddle.zeros(shape=(bsz + 1), dtype="int32") + cum_offsets[1:] = cum_offsets_now + token_num = paddle.sum(seq_lens_this_time) + padding_offsets = paddle.zeros(shape=(token_num), dtype="int32") + cu_seqlens_q = paddle.zeros(shape=(bsz + 1), dtype="int32") + cu_seqlens_k = paddle.zeros(shape=(bsz + 1), dtype="int32") + for i in range(bsz): + seq_len_now = seq_lens_this_time[i] + cum_offset = cum_offsets[i] + for j in range(seq_len_now): + padding_offsets[i * max_seq_len - cum_offset + j] = cum_offset + cum_seq_len = (i + 1) * max_seq_len - cum_offsets[i + 1] + cu_seqlens_q[i + 1] = cum_seq_len + cu_seqlens_k[i + 1] = cum_seq_len + return padding_offsets, cum_offsets[:-1], cu_seqlens_q, cu_seqlens_k + + +class RopeEmbedding: + def _rotary_position_embedding(self, seq_len, head_dim, dtype): + 
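+        # Editor's note (clarifying comment, not part of the original patch):
+        # builds the sinusoid table for the naive RoPE path: [sin | cos]
+        # concatenated along the last axis and returned with shape
+        # (1, 1, seq_len, head_dim), which _apply_rope/_apply_neox_rope
+        # later split back apart via paddle.chunk(rp, 2, axis=-1).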
pos_seq = paddle.arange(0, seq_len, 1, dtype=dtype) + indices = paddle.arange(0, head_dim, 2, dtype=dtype) + indices = 1 / 10000 ** (indices / head_dim) + + sinusoid_inp = pos_seq.unsqueeze(1) * indices.unsqueeze(0) + pos_emb = paddle.concat( + [paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1 + ) + pos_emb = paddle.reshape(pos_emb, (1, 1, seq_len, head_dim)) + pos_emb.stop_gradient = True + return pos_emb + + def _apply_rope(self, rp, q, k, v=None): + # sin [sequence_length, embed_size_per_head//2] + # cos [sequence_length, embed_size_per_head//2] + sin, cos = paddle.chunk(rp, 2, axis=-1) + # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] + sin_pos = paddle.reshape(paddle.stack([sin, sin], axis=-1), rp.shape) + # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] + cos_pos = paddle.reshape(paddle.stack([cos, cos], axis=-1), rp.shape) + # rotate_half_query_layer [-q1,q0,-q3,q2......,-qd-1,qd-2] + rotate_half_q = paddle.reshape( + paddle.stack([-q[:, :, :, 1::2], q[:, :, :, 0::2]], axis=-1), + paddle.shape(q), + ) + query = paddle.add( + paddle.multiply(q, cos_pos), paddle.multiply(rotate_half_q, sin_pos) + ) + # rotate_half_key_layer [-k1,k0,-k3,k2......,-kd-1,kd-2] + rotate_half_k = paddle.reshape( + paddle.stack([-k[:, :, :, 1::2], k[:, :, :, 0::2]], axis=-1), + paddle.shape(k), + ) + key = paddle.add( + paddle.multiply(k, cos_pos), paddle.multiply(rotate_half_k, sin_pos) + ) + if v is not None: + # rotate_half_value_layer [-v1,v0,-v3,v2......,-vd-1,vd-2] + rotate_half_v = paddle.reshape( + paddle.stack([-v[:, :, :, 1::2], v[:, :, :, 0::2]], axis=-1), + paddle.shape(v), + ) + value = paddle.add( + paddle.multiply(v, cos_pos), + paddle.multiply(rotate_half_v, sin_pos), + ) + return query, key, value + return query, key + + def _apply_neox_rope(self, rp, q, k, v=None): + # sin [bs, sequence_length, embed_size_per_head//2] + # cos [bs, sequence_length, embed_size_per_head//2] + sin, cos = paddle.chunk(rp, 2, axis=-1) + + # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ1,θ2......θd/2-1, θ0,θ1,θ2......θd/2-1] + sin_pos = paddle.concat([sin, sin], axis=-1).squeeze(0).unsqueeze(1) + # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ1,θ2......θd/2-1, θ0,θ1,θ2......θd/2-1] + cos_pos = paddle.concat([cos, cos], axis=-1).squeeze(0).unsqueeze(1) + rotate_half_q = paddle.reshape( + paddle.concat( + [-q[:, :, :, sin.shape[-1] :], q[:, :, :, 0 : sin.shape[-1]]], + axis=-1, + ), + paddle.shape(q), + ) + query = paddle.add( + paddle.multiply(q, cos_pos), paddle.multiply(rotate_half_q, sin_pos) + ) + rotate_half_k = paddle.reshape( + paddle.concat( + [-k[:, :, :, sin.shape[-1] :], k[:, :, :, 0 : sin.shape[-1]]], + axis=-1, + ), + paddle.shape(k), + ) + key = paddle.add( + paddle.multiply(k, cos_pos), paddle.multiply(rotate_half_k, sin_pos) + ) + if v is not None: + rotate_half_v = paddle.reshape( + paddle.concat( + [ + -v[:, :, :, sin.shape[-1] :], + v[:, :, :, 0 : sin.shape[-1]], + ], + axis=-1, + ), + paddle.shape(v), + ) + value = paddle.add( + paddle.multiply(v, cos_pos), + paddle.multiply(rotate_half_v, sin_pos), + ) + return query, key, value + return query, key + + +def remove_padding(seq_lens, cu_seq_lens, inputs, token_num): + bsz, num_head, seq_len, dim_head = inputs.shape + output = paddle.zeros( + shape=[token_num, num_head * dim_head], dtype=inputs.dtype + ) + inputs = inputs.transpose([0, 2, 1, 3]).reshape([bsz, seq_len, -1]) + for i in range(bsz): + seq_len_now = seq_lens[i] + start_idx = cu_seq_lens[i] + end_idx = cu_seq_lens[i + 
1] + output[start_idx:end_idx, :] = inputs[i, :seq_len_now, :] + return output + + +def block_cache_to_naive_cache( + cache_k, cache_v, bsz, block_tables, cache_seq_len +): + _, num_head, blocksize, dim_head = cache_k.shape + out_cache_k = paddle.zeros( + shape=[bsz, num_head, cache_seq_len, dim_head], dtype=cache_k.dtype + ) + out_cache_v = paddle.zeros( + shape=[bsz, num_head, cache_seq_len, dim_head], dtype=cache_v.dtype + ) + for i in range(bsz): + for j in range(cache_seq_len): + out_cache_k[i, :, j, :] = cache_k[ + block_tables[i, j // blocksize], :, j % blocksize, : + ] + out_cache_v[i, :, j, :] = cache_v[ + block_tables[i, j // blocksize], :, j % blocksize, : + ] + return out_cache_k, out_cache_v + + +class TestBlockMultiHeadAttnRoPEXPU(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.name = "TestBlockMultiHeadAttnRoPE" + self.place = paddle.XPUPlace(0) + self.batch_size = 2 + self.num_head = 8 + self.seq_len = 64 + self.max_dec_len = 64 + self.dim_head = 64 + self.hid_dim = self.num_head * self.dim_head + self.blocksize = 64 + self.block_num_per_seq = ( + self.seq_len + self.max_dec_len + self.blocksize - 1 + ) // self.blocksize + self.rope = RopeEmbedding() + self.max_block_num = self.block_num_per_seq * self.batch_size + self.free_list = list(range(self.max_block_num - 1, -1, -1)) + self.seq_lens_encoder = paddle.to_tensor( + [ + self.seq_len, + ] + * self.batch_size, + "int32", + ) + self.seq_lens_decoder = paddle.to_tensor( + [ + 0, + ] + * self.batch_size, + "int32", + ) + self.seq_lens_this_time = paddle.to_tensor( + [ + self.seq_len, + ] + * self.batch_size, + "int32", + ) + self.shape = ( + self.batch_size, + self.num_head, + self.seq_len, + self.dim_head, + ) + self.cache_shape = ( + self.max_block_num, + self.num_head, + self.blocksize, + self.dim_head, + ) + self.dtype = 'float16' + self.attention_mask = create_attn_mask( + self.dtype, + self.batch_size, + [ + self.seq_len, + ] + * self.batch_size, + ) + self.scale = 1.0 / np.sqrt(self.shape[-1]) + self.cache_k = paddle.zeros(shape=self.cache_shape, dtype=self.dtype) + self.cache_v = paddle.zeros(shape=self.cache_shape, dtype=self.dtype) + self.block_tables = paddle.zeros( + shape=(self.batch_size, self.block_num_per_seq), dtype="int32" + ) + self.cache_k_per_batch_maxs = paddle.zeros( + [self.batch_size, 6], dtype="float32" + ) + self.cache_v_per_batch_maxs = paddle.zeros( + [self.batch_size, 6], dtype="float32" + ) + for i in range(self.batch_size): + need_block_num = ( + self.seq_len + self.max_dec_len + self.blocksize - 1 + ) // self.blocksize + for j in range(need_block_num): + self.block_tables[i, j] = self.free_list.pop() + ( + self.padding_offset, + self.cum_offset, + self.cu_seqlens_q, + self.cu_seqlens_k, + ) = get_padding_offset( + self.batch_size, self.seq_len, self.seq_lens_this_time + ) + self.token_num = self.padding_offset.shape[0] + + def get_rotary_position_embedding(self, position_ids, head_dim): + bsz, max_seq_len = position_ids.shape[:2] + rot_emb = paddle.zeros( + (2, bsz, max_seq_len, 1, head_dim), dtype="float32" + ) + inv_freq = 10000 ** ( + -paddle.arange(0, head_dim, 2, dtype="float32") / head_dim + ) + + # shape: [B, S, D/2] + freqs = paddle.einsum( + "ij,k->ijk", position_ids.cast("float32"), inv_freq + ) + # shape: [B, S, D] + emb = paddle.concat([freqs, freqs], axis=-1).reshape( + (bsz, max_seq_len, head_dim) + ) + # emb = paddle.stack([freqs], axis=-1).reshape( + # (bsz, max_seq_len, head_dim // 2) + # ) + # shape: [B, S, 1, D] + emb = paddle.unsqueeze(emb, 2) + + 
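+        # Editor's note (clarifying comment, not part of the original patch):
+        # rot_emb[0] carries the cos table and rot_emb[1] the sin table,
+        # shape [2, bsz, max_seq_len, 1, head_dim]; the test feeds this
+        # tensor to block_multihead_attention_xpu as its rotary_embs input.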
rot_emb[0] = paddle.cos(emb) + rot_emb[1] = paddle.sin(emb) + return rot_emb + + def test_all(self): + paddle.disable_static() + tmp_position_ids = paddle.arange( + self.seq_len + self.max_dec_len + ).reshape((1, -1)) + self.rope_emb = self.get_rotary_position_embedding( + tmp_position_ids, self.dim_head + ) + # encoder + query = np.random.random(self.shape) + q = paddle.to_tensor( + query, place=self.place, dtype=self.dtype, stop_gradient=False + ) + key = np.random.random(self.shape) + k = paddle.to_tensor( + key, place=self.place, dtype=self.dtype, stop_gradient=False + ) + value = np.random.random(self.shape) + v = paddle.to_tensor( + value, place=self.place, dtype=self.dtype, stop_gradient=False + ) + qkv = paddle.stack( + [ + q.transpose([0, 2, 1, 3]).reshape( + [self.token_num, self.hid_dim] + ), + k.transpose([0, 2, 1, 3]).reshape( + [self.token_num, self.hid_dim] + ), + v.transpose([0, 2, 1, 3]).reshape( + [self.token_num, self.hid_dim] + ), + ], + axis=1, + ).reshape([self.token_num, -1]) + sinusoidal_pos = self.rope._rotary_position_embedding( + self.seq_len, self.dim_head, "float32" + ) + q, k = self.rope._apply_neox_rope( + sinusoidal_pos.astype("float16"), q, k + ) + + out_ = naive_attention_impl( + q, k, v, None, None, None, None, self.attention_mask, self.scale + ) + out_ = remove_padding( + self.seq_lens_this_time, self.cu_seqlens_q, out_, self.token_num + ) + out = block_multihead_attention_xpu( + qkv, + self.cache_k, + self.cache_v, + self.seq_lens_encoder, + self.seq_lens_decoder, + self.seq_lens_this_time, + self.padding_offset, + self.cum_offset, + self.cu_seqlens_q, + self.cu_seqlens_k, + self.block_tables, + self.cache_k_per_batch_maxs, + self.cache_v_per_batch_maxs, + None, # pre_key_cache + None, # pre_value_cache + None, # cache_k_quant_scales + None, # cache_v_quant_scales + None, # cache_k_dequant_scales + None, # cache_v_dequant_scales + None, # qkv_out_scale + None, # qkv_bias + None, # out_shift + None, # out_smooth + None, # max_enc_len_this_time + None, # max_dec_len_this_time + self.rope_emb, # rotary_embs + None, # attn_mask + None, # tgt_mask + self.seq_len, + self.blocksize, + True, # use_neox_rotary_style + )[0] + np.testing.assert_allclose( + out.numpy(), + out_.numpy(), + rtol=5e-03, + atol=1e-03, + ) + + # decoder + naive_cache_k, naive_cache_v = block_cache_to_naive_cache( + self.cache_k, + self.cache_v, + self.batch_size, + self.block_tables, + self.seq_len, + ) + + self.seq_lens_decoder = self.seq_lens_encoder.clone() + self.seq_lens_encoder[:] = paddle.zeros_like(self.seq_lens_encoder) + self.seq_lens_this_time[:] = 1 + self.shape = ( + self.batch_size, + self.num_head, + 1, + self.dim_head, + ) + query = np.random.random(self.shape) + q = paddle.to_tensor( + query, place=self.place, dtype=self.dtype, stop_gradient=False + ) + key = np.random.random(self.shape) + k = paddle.to_tensor( + key, place=self.place, dtype=self.dtype, stop_gradient=False + ) + value = np.random.random(self.shape) + v = paddle.to_tensor( + value, place=self.place, dtype=self.dtype, stop_gradient=False + ) + + qkv = paddle.stack( + [ + q.transpose([0, 2, 1, 3]).reshape( + [self.batch_size, self.hid_dim] + ), + k.transpose([0, 2, 1, 3]).reshape( + [self.batch_size, self.hid_dim] + ), + v.transpose([0, 2, 1, 3]).reshape( + [self.batch_size, self.hid_dim] + ), + ], + axis=1, + ).reshape([self.batch_size, -1]) + + sinusoidal_pos = self.rope._rotary_position_embedding( + self.seq_len + 1, self.dim_head, "float32" + )[:, :, -1:, :] + q, k = self.rope._apply_neox_rope( + 
sinusoidal_pos.astype("float16"), q, k + ) + ( + self.padding_offset, + self.cum_offset, + self.cu_seqlens_q, + self.cu_seqlens_k, + ) = get_padding_offset(self.batch_size, 1, self.seq_lens_this_time) + out_ = ( + naive_attention_impl( + q, + k, + v, + naive_cache_k, + naive_cache_v, + None, + None, + None, + self.scale, + ) + .transpose([0, 2, 1, 3]) + .reshape([self.batch_size, -1]) + ) + out = block_multihead_attention_xpu( + qkv, + self.cache_k, + self.cache_v, + self.seq_lens_encoder, + self.seq_lens_decoder, + self.seq_lens_this_time, + self.padding_offset, + self.cum_offset, + self.cu_seqlens_q, + self.cu_seqlens_k, + self.block_tables, + self.cache_k_per_batch_maxs, + self.cache_v_per_batch_maxs, + None, # pre_key_cache + None, # pre_value_cache + None, # cache_k_quant_scales + None, # cache_v_quant_scales + None, # cache_k_dequant_scales + None, # cache_v_dequant_scales + None, # qkv_out_scale + None, # qkv_bias + None, # out_shift + None, # out_smooth + None, # max_enc_len_this_time + None, # max_dec_len_this_time + self.rope_emb, # rotary_embs + None, # attn_mask + None, # tgt_mask + 1, # seq_len, + self.blocksize, + True, # use_neox_rotary_style + )[0] + # NOTE: The diff of decoder is a little big + np.testing.assert_allclose( + out.numpy(), + out_.numpy(), + rtol=5e-02, + atol=5e-02, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/xpu/test_collective_api_base.py b/test/xpu/test_collective_api_base.py index 0c3d710a06335..c94061d5fc6d1 100644 --- a/test/xpu/test_collective_api_base.py +++ b/test/xpu/test_collective_api_base.py @@ -202,7 +202,7 @@ def setUp(self): self._trainers = 2 self._ps_endpoints = f"127.0.0.1:{self._find_free_port()},127.0.0.1:{self._find_free_port()}" self._python_interp = sys.executable - self._master_endpoints = "127.0.0.1:%s" % (self._find_free_port()) + self._master_endpoints = f"127.0.0.1:{self._find_free_port()}" self.temp_dir = tempfile.TemporaryDirectory() @@ -300,15 +300,15 @@ def _run_cluster(self, model_file, envs): tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n') + sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n') # close trainer file tr0_pipe.close() tr1_pipe.close() with open(path0, "r") as f: - sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) + sys.stderr.write(f'trainer 0 stderr file: {f.read()}\n') with open(path1, "r") as f: - sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) + sys.stderr.write(f'trainer 1 stderr file: {f.read()}\n') def load_and_remove(path): with open(path, 'rb') as f: diff --git a/test/xpu/test_collective_base_xpu.py b/test/xpu/test_collective_base_xpu.py index 8a3289f0eb02a..c6cd081b498d7 100644 --- a/test/xpu/test_collective_base_xpu.py +++ b/test/xpu/test_collective_base_xpu.py @@ -244,8 +244,8 @@ def _run_cluster(self, model_file, envs): tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() - sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write(f'trainer 0 stderr: {tr0_err}\n') + sys.stderr.write(f'trainer 1 stderr: {tr1_err}\n') # close trainer file tr0_pipe.close() tr1_pipe.close() diff --git a/test/xpu/test_conv2d_op_xpu.py b/test/xpu/test_conv2d_op_xpu.py index df36f226408eb..4c7419ae9e5fd 100644 --- a/test/xpu/test_conv2d_op_xpu.py +++ b/test/xpu/test_conv2d_op_xpu.py 
@@ -36,14 +36,14 @@ def conv2d_forward_naive( ): if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( - "Unknown Attr(data_format): '%s' ." - "It can only be 'NCHW' or 'NHWC'." % str(data_format) + f"Unknown Attr(data_format): '{str(data_format)}' ." + "It can only be 'NCHW' or 'NHWC'." ) channel_last = data_format == "NHWC" diff --git a/test/xpu/test_conv2d_transpose_op_xpu.py b/test/xpu/test_conv2d_transpose_op_xpu.py index 57c564335fbc1..1728889827992 100644 --- a/test/xpu/test_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_conv2d_transpose_op_xpu.py @@ -31,8 +31,8 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if attrs['data_format'] == 'NHWC': diff --git a/test/xpu/test_conv3d_op_xpu.py b/test/xpu/test_conv3d_op_xpu.py index 021c57821c12d..26582b4e1b2c5 100644 --- a/test/xpu/test_conv3d_op_xpu.py +++ b/test/xpu/test_conv3d_op_xpu.py @@ -31,14 +31,14 @@ def conv3d_forward_naive( ): if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( - "Unknown Attr(data_format): '%s' ." - "It can only be 'NCDHW' or 'NDHWC'." % str(data_format) + f"Unknown Attr(data_format): '{str(data_format)}' ." + "It can only be 'NCDHW' or 'NDHWC'." ) channel_last = data_format == "NDHWC" diff --git a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py index 96077ae8c83d0..878519fbd507d 100644 --- a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py @@ -31,8 +31,8 @@ def depthwiseconv2dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." 
) if attrs['data_format'] == 'NHWC': diff --git a/test/xpu/test_parallel_dygraph_dataparallel.py b/test/xpu/test_parallel_dygraph_dataparallel.py index 0070f8ade9802..3eed21553b7a5 100644 --- a/test/xpu/test_parallel_dygraph_dataparallel.py +++ b/test/xpu/test_parallel_dygraph_dataparallel.py @@ -73,9 +73,11 @@ def start_local_trainers( for t in pod.trainers: proc_env = { "PADDLE_DISTRI_BACKEND": "bkcl", - "FLAGS_selected_xpus": "%s" % ",".join([str(g) for g in t.gpus]), + "FLAGS_selected_xpus": "{}".format( + ",".join([str(g) for g in t.gpus]) + ), "PADDLE_TRAINER_ID": "%d" % t.rank, - "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, + "PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), } diff --git a/test/xpu/test_pool2d_op_xpu.py b/test/xpu/test_pool2d_op_xpu.py index f62ffb4fc45a6..1d3c1def63bfb 100644 --- a/test/xpu/test_pool2d_op_xpu.py +++ b/test/xpu/test_pool2d_op_xpu.py @@ -172,8 +172,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): padding_algorithm = padding_algorithm.upper() if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if padding_algorithm == "VALID": diff --git a/test/xpu/test_pool3d_op_xpu.py b/test/xpu/test_pool3d_op_xpu.py index 865029ad0d07d..01dd6d77b2b86 100644 --- a/test/xpu/test_pool3d_op_xpu.py +++ b/test/xpu/test_pool3d_op_xpu.py @@ -68,8 +68,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): padding_algorithm = padding_algorithm.upper() if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: raise ValueError( - "Unknown Attr(padding_algorithm): '%s'. " - "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + f"Unknown Attr(padding_algorithm): '{str(padding_algorithm)}'. " + "It can only be 'SAME' or 'VALID'." ) if padding_algorithm == "VALID": diff --git a/test/xpu/test_swiglu_op_xpu.py b/test/xpu/test_swiglu_op_xpu.py new file mode 100644 index 0000000000000..35d8350c85e26 --- /dev/null +++ b/test/xpu/test_swiglu_op_xpu.py @@ -0,0 +1,172 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
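For orientation while reading the new test file that follows: the activation under test is SwiGLU, `silu(x) * y` with `silu(x) = x * sigmoid(x)`, and the fused kernel also accepts both operands packed into a single tensor along the last axis. A minimal NumPy sketch of the two call forms (helper names are illustrative; the packed form mirrors `fused_swiglu_impl(concat([x, y], axis=-1))` in the test):

```python
import numpy as np

def swiglu_ref(x, y):
    # silu(x) * y, where silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
    return x / (1.0 + np.exp(-x)) * y

def swiglu_packed_ref(xy):
    # Packed form: x and y concatenated along the last axis.
    x, y = np.split(xy, 2, axis=-1)
    return swiglu_ref(x, y)
```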
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import swiglu as fused_swiglu_impl
+
+
+def swiglu(x, y, out_grad):
+    if isinstance(x, np.ndarray):
+        x = paddle.to_tensor(x)
+        y = paddle.to_tensor(y)
+        out_grad = paddle.to_tensor(out_grad)
+
+    origin_x = x.detach().clone()
+    origin_x.stop_gradient = False
+    x = origin_x
+
+    origin_y = y.detach().clone()
+    origin_y.stop_gradient = False
+    y = origin_y
+
+    dtype = x.dtype
+    need_convert = False
+    assert dtype == y.dtype
+    output_dtype = dtype
+
+    out = F.silu(x) * y
+    if need_convert:
+        out = out.astype(dtype)
+    out.backward(out_grad)
+    ret = [
+        out.astype(output_dtype),
+        origin_x.grad.astype(output_dtype),
+        origin_y.grad.astype(output_dtype),
+    ]
+    return ret
+
+
+def fused_swiglu(x, y, out_grad):
+    x = x.detach().clone()
+    x.stop_gradient = False
+    if y is not None:
+        y = y.detach().clone()
+        y.stop_gradient = False
+    out = fused_swiglu_impl(x, y)
+    out.backward(out_grad)
+
+    output_dtype = x.dtype
+    ret = [
+        out.astype(output_dtype),
+    ]
+    if y is not None:
+        x_grad, y_grad = x.grad, y.grad
+    else:
+        x_grad, y_grad = paddle.split(x.grad, 2, axis=-1)
+
+    ret.append(x_grad.astype(output_dtype))
+    ret.append(y_grad.astype(output_dtype))
+    return ret
+
+
+tol_map = {
+    paddle.float64: [1e-8, 1e-8],
+    paddle.float32: [1e-6, 1e-6],
+    paddle.float16: [1e-3, 1e-3],
+    paddle.bfloat16: [1e-2, 1e-2],
+}
+
+
+class TestSwiGLUDygraph(unittest.TestCase):
+    def setUp(self):
+        self.init_case()
+        self.seed = 1234
+
+    def init_case(self):
+        self.shape = []
+        self.shape.append([8, 100])
+        self.shape.append([4, 102])
+
+    def check_dygraph_impl(self, device, shape, dtype):
+        x = paddle.randn(shape, dtype=dtype)
+        y = paddle.randn(shape, dtype=dtype)
+        out_grad = paddle.randn(shape, dtype=dtype)
+
+        ret1 = swiglu(x, y, out_grad)
+        ret2 = fused_swiglu(x, y, out_grad)
+        ret3 = fused_swiglu(paddle.concat([x, y], axis=-1), None, out_grad)
+
+        atol, rtol = tol_map[dtype]
+        err_msg = (
+            f"Failed when device = {device}, dtype = {dtype}, shape = {shape}"
+        )
+        for t1, t2, t3 in zip(ret1, ret2, ret3):
+            t1, t2, t3 = t1.numpy(), t2.numpy(), t3.numpy()
+            np.testing.assert_allclose(
+                t1, t2, atol=atol, rtol=rtol, err_msg=err_msg
+            )
+            np.testing.assert_equal(t2, t3, err_msg=err_msg)
+
+    def check_dygraph(self, shape):
+        metas = []
+        metas.append(('xpu', paddle.float32))
+        metas.append(('xpu', paddle.float64))
+        # Enable in KL3
+        # metas.append(('xpu', paddle.float16))
+        # metas.append(('xpu', paddle.bfloat16))
+
+        for device, dtype in metas:
+            origin_device = paddle.get_device()
+            paddle.set_device(device)
+            for with_split in [True]:
+                self.check_dygraph_impl(device, shape, dtype)
+            paddle.set_device(origin_device)
+
+    def check_static_graph(self, shape, dtype="float32"):
+        x = paddle.static.data(name='x', shape=shape, dtype=dtype)
+        y = paddle.static.data(name='y', shape=shape, dtype=dtype)
+        concated_x = paddle.static.data(
+            name='concated_x',
+            shape=list(shape[:-1]) + [shape[-1] * 2],
+            dtype=dtype,
+        )
+        out1 = fused_swiglu_impl(x, y)
+        out2 = fused_swiglu_impl(concated_x)
+
+        concated_x_np = np.random.random(concated_x.shape).astype(dtype)
+        x_np, y_np = np.split(concated_x_np, 2, axis=-1)
+
+        exe = paddle.static.Executor()
+        t1, t2 = exe.run(
+            feed={'x': x_np, 'y': y_np, 'concated_x': concated_x_np},
+            fetch_list=[out1, out2],
+        )
+        np.testing.assert_equal(t1, t2)
+
+    def check_main(self, shape):
+        self.check_dygraph(shape)
+        paddle.enable_static()
+        with
paddle.static.program_guard(
+            paddle.static.Program(), paddle.static.Program()
+        ):
+            self.check_static_graph(shape)
+        paddle.disable_static()
+
+    def test_main(self):
+        for i in self.shape:
+            self.check_main(i)
+
+
+class TestSwigluOp(TestSwiGLUDygraph):
+    def init_case(self):
+        self.shape = [[1, 4096, 1376], [1, 4096, 11008]]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/xpu/test_zero_dim_tensor_xpu.py b/test/xpu/test_zero_dim_tensor_xpu.py
index 133c9b1302013..ac5e2df75b46f 100644
--- a/test/xpu/test_zero_dim_tensor_xpu.py
+++ b/test/xpu/test_zero_dim_tensor_xpu.py
@@ -345,7 +345,7 @@ def test_dygraph_binary(self):
             # 1) x is 0D, y is 0D
             x_np = np.random.randint(-10, 10, [])
             y_np = np.random.randint(-10, 10, [])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
@@ -357,7 +357,7 @@ def test_dygraph_binary(self):
             # 2) x is ND, y is 0D
             x_np = np.random.randint(-10, 10, [3, 5])
             y_np = np.random.randint(-10, 10, [])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
@@ -369,7 +369,7 @@ def test_dygraph_binary(self):
             # 3) x is 0D , y is ND
             x_np = np.random.randint(-10, 10, [])
             y_np = np.random.randint(-10, 10, [3, 5])
-            out_np = eval('np.%s(x_np, y_np)' % api.__name__)
+            out_np = eval(f'np.{api.__name__}(x_np, y_np)')
 
             x = paddle.to_tensor(x_np)
             y = paddle.to_tensor(y_np)
diff --git a/third_party/onednn b/third_party/onednn
index 01204edbda1c2..0fb7e6ed4f32e 160000
--- a/third_party/onednn
+++ b/third_party/onednn
@@ -1 +1 @@
-Subproject commit 01204edbda1c2a4ff0cccd40476ed6bd2fb62d56
+Subproject commit 0fb7e6ed4f32e5d89832b2bd742bbf834cd296ed
diff --git a/tools/CheckPRTemplate.py b/tools/CheckPRTemplate.py
index 1cc601dba0a29..a3a350d107af6 100644
--- a/tools/CheckPRTemplate.py
+++ b/tools/CheckPRTemplate.py
@@ -79,7 +79,7 @@ def parameter_accuracy(body):
             for i in value:
                 i = i.strip().lower()
                 if i not in test_list_lower:
-                    single_mess += '%s.' % i
+                    single_mess += f'{i}.'
         if len(single_mess) != 0:
             message += f'{key} should be in {test_list}. but now is [{single_mess}].'
     return message
diff --git a/tools/CrossStackProfiler/CspFileReader.py b/tools/CrossStackProfiler/CspFileReader.py
index 28038b5c76d3b..5802edc965cca 100755
--- a/tools/CrossStackProfiler/CspFileReader.py
+++ b/tools/CrossStackProfiler/CspFileReader.py
@@ -108,7 +108,7 @@ def printArgs(self):
 
     def _checkArgsKey(self, key, type):
         if key not in self._args:
-            raise KeyError("args should has key [%s]!" % key)
+            raise KeyError(f"args should have key [{key}]!")
 
         if not isinstance(self._args[key], type):
             raise TypeError(
@@ -130,17 +130,14 @@ def _checkArgs(self):
             or self._organizeForm == FILEORGANIZEFORM_BYOTHER
         ):
             raise NotImplementedError(
-                "we have not known how to process this form of file [%s]!"
-                % self._organizeForm
+                f"we do not know how to process this form of file [{self._organizeForm}]!"
             )
 
         self._checkArgsKey("gpuPerTrainer", int)
 
         self._checkArgsKey("dataPath", str)
         if not os.path.exists(self._dataPath):
-            raise OSError(
-                "input data path [%s] not existed!" % (self._dataPath)
-            )
+            raise OSError(f"input data path [{self._dataPath}] does not exist!")
 
         self._checkArgsKey("groupSize", int)
         self._checkArgsKey("displaySize", int)
@@ -183,8 +180,7 @@ def _getFileList(self):
                     newFileList.append(file)
                 else:
                     raise NotImplementedError(
-                        "[%s] is repeated by id, we don not how to process it!"
-                        % file
+                        f"[{file}] is repeated by id, we do not know how to process it!"
                     )
 
         if not self._fileList:
@@ -201,7 +197,7 @@ def _sortBySuffix(elem):
 
         if not self._fileList:
             self._logger.warning(
-                "we can not find any file in dir [%s]!" % self._dataPath
+                f"we cannot find any file in dir [{self._dataPath}]!"
             )
         else:
             self._logger.info(
@@ -215,12 +211,11 @@ def _sortBySuffix(elem):
 
     def _getId(self, fileName, organizeForm, sed="."):
         if self._organizeForm != organizeForm:
            raise TypeError(
-                "Can not get rank id when organizer form is not %s!"
-                % organizeForm
+                f"Cannot get rank id when organizer form is not {organizeForm}!"
            )
 
         if not os.path.isfile(fileName):
-            raise OSError("[%s] is not a valid file!" % (fileName))
+            raise OSError(f"[{fileName}] is not a valid file!")
 
         try:
             prefix_str = fileName.split(sed)[-1]
@@ -228,13 +223,12 @@ def _getId(self, fileName, organizeForm, sed="."):
             return int(prefix_str)
         except ValueError as e:
             print(e)
-            raise TypeError("invalid fileName [%s]" % fileName)
+            raise TypeError(f"invalid fileName [{fileName}]")
         except IndexError as e:
             print(e)
             raise TypeError(
-                "invalid fileName [%s], the prefix should be a number!"
-                % fileName
+                f"invalid fileName [{fileName}], the prefix should be a number!"
             )
 
     def getRankId(self, fileName, sed="."):
@@ -298,19 +292,15 @@ def getDcgmInfoDict(self, groupId, gpuId, tmpPath="./tmp"):
 
     def getDict(self, name, groupId, gpuId, tmpPath="./tmp"):
         fileName = self.getFileName(name, groupId, gpuId, tmpPath)
         if not os.path.isfile(fileName):
-            raise OSError("[%s] is not existed!" % fileName)
+            raise OSError(f"[{fileName}] does not exist!")
 
         data = {}
         with open(fileName, "r") as rf:
             try:
                 data = json.load(rf)
             except Exception:
-                self._logger.error(
-                    "read [%s] error. not a json file!" % (fileName)
-                )
-                raise TypeError(
-                    "read [%s] error. not a json file!" % (fileName)
-                )
+                self._logger.error(f"read [{fileName}] error: not a json file!")
+                raise TypeError(f"read [{fileName}] error: not a json file!")
         return data
 
     def dumpOpInfoDict(
@@ -344,7 +334,7 @@ def dumpDict(
         fileObject = open(fileName, 'w')
         fileObject.write(jsObj)
         fileObject.close()
-        self._logger.info("dump [%s] successfully!" % fileName)
+        self._logger.info(f"dump [{fileName}] successfully!")
 
 def getLogger():
diff --git a/tools/CrossStackProfiler/DCGMFileReader.py b/tools/CrossStackProfiler/DCGMFileReader.py
index f462ce5c9ad5e..eb31ad7820a78 100755
--- a/tools/CrossStackProfiler/DCGMFileReader.py
+++ b/tools/CrossStackProfiler/DCGMFileReader.py
@@ -88,7 +88,7 @@ def parseFileByGroup(self, groupId, processNum=8):
     def _parseTask(self, taskList, q=None):
         is_first = True
         for fileName in taskList:
-            self._logger.info("I am processing %s!" % fileName)
+            self._logger.info(f"I am processing {fileName}!")
             tmp_data = self._parseSingleFile(fileName)
             if tmp_data is None:
                 continue
@@ -103,7 +103,7 @@ def _parseTask(self, taskList, q=None):
         dcgm_data = dcgm_data.dropna()
         if q is not None:
             q.put(dcgm_data)
-        self._logger.info("I finish processing %s!"
% fileName) + self._logger.info(f"I finish processing {fileName}!") return dcgm_data def _parseSingleFile(self, fileName): @@ -192,7 +192,7 @@ def _getDCGMTraceInfoByGpuId( di = {} # name = "%s_%d" % (metric, trainerId) - name = "%s" % (metric) + name = f"{metric}" di['name'] = name di['pid'] = pid_map[metric] di['ts'] = self._align_ts(int(row['ts'])) diff --git a/tools/CrossStackProfiler/ProfileFileReader.py b/tools/CrossStackProfiler/ProfileFileReader.py index af955bd6652c4..266e9e5cf706d 100755 --- a/tools/CrossStackProfiler/ProfileFileReader.py +++ b/tools/CrossStackProfiler/ProfileFileReader.py @@ -46,7 +46,7 @@ def _parseTask(self, taskList, q=None): profile_dict["trainerRank.%03d" % (rankId)] = self._parseSingleFile( fileName ) - self._logger.info("I finish processing %s!" % fileName) + self._logger.info(f"I finish processing {fileName}!") if q is not None: q.put(profile_dict) diff --git a/tools/analysisPyXml.py b/tools/analysisPyXml.py index 2f2d8b472c566..9d9ec062180cb 100644 --- a/tools/analysisPyXml.py +++ b/tools/analysisPyXml.py @@ -31,7 +31,7 @@ def analysisPyXml(rootPath, ut): for clazz in root.findall('packages/package/classes/class'): clazz_filename = clazz.attrib.get('filename') if not clazz_filename.startswith('/paddle'): - clazz_filename = '/paddle/%s' % clazz_filename + clazz_filename = f'/paddle/{clazz_filename}' for line in clazz.findall('lines/line'): line_hits = int(line.attrib.get('hits')) if line_hits != 0: diff --git a/tools/analysis_build_time.py b/tools/analysis_build_time.py index 6ae3ee6bbacc1..ae340a1bcfe03 100644 --- a/tools/analysis_build_time.py +++ b/tools/analysis_build_time.py @@ -33,10 +33,10 @@ def getUsefulBuildTimeFile(filename): def analysisBuildTime(): - filename = '%s/build/build-time' % root_path + filename = f'{root_path}/build/build-time' getUsefulBuildTimeFile(filename) - os.system('rm -rf %s/tools/tempbuildTime.txt' % root_path) - with open('%s/tools/analysis_build_time' % root_path, 'r') as f: + os.system(f'rm -rf {root_path}/tools/tempbuildTime.txt') + with open(f'{root_path}/tools/analysis_build_time', 'r') as f: lines = f.readlines() for line in lines: try: diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index b036c08e1d93e..6d422774d12ed 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -32,7 +32,7 @@ if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then cp $PADDLE_ROOT/paddle/scripts/paddle_build.sh $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh elif [[ "$SYSTEM" == "Windows_NT" ]];then git remote | grep upstream - if [ $? != 0 ]; then + if [ $? 
!= 0 ]; then
        git remote add upstream https://github.com/PaddlePaddle/Paddle.git
    fi
    git fetch upstream ${BRANCH}
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 4a8e7cf708994..c32e3c99f45a9 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -40,12 +40,18 @@ function add_failed(){
 api_params_diff=`python ${PADDLE_ROOT}/tools/check_api_compatible.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec`
 api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api`
+api_annotation_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.annotations ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.annotations`
 
 if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then
     echo_line="You must have one RD (XiaoguangHu01, jeff41404, lanxianghit or qingqing01) approval for API change.\n"
     check_approval 1 XiaoguangHu01 jeff41404 lanxianghit qingqing01
 fi
 
+if [ "$api_annotation_diff" != "" ]; then
+    echo_line="You must have one member of Typing group (SigureMo, megemini, zrr1999, sunzhongkai588, luotao1) approval for API annotation change.\n"
+    check_approval 1 SigureMo megemini zrr1999 sunzhongkai588 luotao1
+fi
+
 api_yaml_diff=`python ${PADDLE_ROOT}/tools/check_api_yaml_same.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec ${BRANCH} ${PADDLE_ROOT}`
 if [ "$api_yaml_diff" != "" ]; then
     echo_line="API's name and params should be consistent with op's name and params in yaml.
@@ -133,7 +139,7 @@ if [ -n "${echo_list}" ];then
     echo "**************************************************************"
 
     # L40 L48 L62 has fetch the result out, but there are splitted.
-    if [ "${api_spec_diff}" != "" -o "${api_doc_spec_diff}" != "" ] ; then
+    if [ "${api_spec_diff}" != "" -o "${api_annotation_diff}" != "" ] ; then
         python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec ${PADDLE_ROOT}/paddle/fluid/API_PR.spec
     fi
     if [ "${api_params_diff}" != "" ] ; then
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index d637c4f0c3b82..c844c09565da3 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -21,11 +21,12 @@ fi
 PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
 
 # If you want to add monitoring file modifications, please update the .github/CODEOWNERS file
-API_FILES=("tools/print_signatures.py"
-           "tools/sampcd_processor.py"
-           "tools/check_pr_approval.py"
-           "tools/checkout_api_compatible.py"
-           )
+API_FILES=(
+    "tools/print_signatures.py"
+    "tools/sampcd_processor.py"
+    "tools/check_pr_approval.py"
+    "tools/checkout_api_compatible.py"
+)
 
 approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
 git_files=`git diff --numstat upstream/$BRANCH| wc -l`
diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py
index ca3df4bb99eef..82f7967133576 100644
--- a/tools/check_op_benchmark_result.py
+++ b/tools/check_op_benchmark_result.py
@@ -21,7 +21,7 @@
 
 def check_path_exists(path):
     """Assert whether file/directory exists."""
-    assert os.path.exists(path), "%s does not exist." % path
+    assert os.path.exists(path), f"{path} does not exist."
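The check_op_benchmark_result.py hunks continuing below gate PRs on relative timing diffs: the visible code computes the total-time change as (pr - develop) / develop, and the speed check fails a case above a 5% GPU-time regression. A compact sketch of that gating rule (the function name is illustrative, and it assumes the GPU-time diff is computed the same way as the total-time diff shown in the hunk):

```python
def speed_regressed(develop_time, pr_time, threshold=0.05):
    # Mirrors check_speed_result: a case fails the speed check when the
    # PR's time exceeds the develop baseline by more than 5% (relative).
    return (pr_time - develop_time) / develop_time > threshold
```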
def parse_case_name(log_file_name): @@ -48,7 +48,7 @@ def parse_log_file(log_file): pass # do nothing if result is None: - logging.warning("Parse %s fail!" % log_file) + logging.warning(f"Parse {log_file} fail!") return result @@ -81,29 +81,29 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result): develop_total_time = develop_data.get("total") total_time_diff = (pr_total_time - develop_total_time) / develop_total_time - logging.info("------ OP: %s ------" % case_name) + logging.info(f"------ OP: {case_name} ------") logging.info( f"GPU time change: {gpu_time_diff_str} (develop: {develop_gpu_time:.7f} -> PR: {pr_gpu_time:.7f})" ) logging.info( f"Total time change: {total_time_diff * 100:.5f}% (develop: {develop_total_time:.7f} -> PR: {pr_total_time:.7f})" ) - logging.info("backward: %s" % pr_result.get("backward")) + logging.info("backward: {}".format(pr_result.get("backward"))) logging.info("parameters:") for line in pr_result.get("parameters").strip().split("\n"): - logging.info("\t%s" % line) + logging.info(f"\t{line}") return gpu_time_diff > 0.05 def check_accuracy_result(case_name, pr_result): """Check accuracy result.""" - logging.info("------ OP: %s ------" % case_name) - logging.info("Accuracy diff: %s" % pr_result.get("diff")) - logging.info("backward: %s" % pr_result.get("backward")) + logging.info(f"------ OP: {case_name} ------") + logging.info("Accuracy diff: {}".format(pr_result.get("diff"))) + logging.info("backward: {}".format(pr_result.get("backward"))) logging.info("parameters:") for line in pr_result.get("parameters").strip().split("\n"): - logging.info("\t%s" % line) + logging.info(f"\t{line}") return not pr_result.get("consistent") @@ -154,11 +154,11 @@ def update_api_info_file(fail_case_list, api_info_file): def summary_results(check_results, api_info_file): """Summary results and return sys.exit code.""" for case_name in check_results["speed"]: - logging.error("Check speed result with case \"%s\" failed." % case_name) + logging.error(f"Check speed result with case \"{case_name}\" failed.") for case_name in check_results["accuracy"]: logging.error( - "Check accuracy result with case \"%s\" failed." % case_name + f"Check accuracy result with case \"{case_name}\" failed." ) if len(check_results["speed"]) and api_info_file: diff --git a/tools/check_sequence_op.sh b/tools/check_sequence_op.sh index 35357476a3224..51a482c3e9306 100644 --- a/tools/check_sequence_op.sh +++ b/tools/check_sequence_op.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 58e327327e6ad..93eb52a4f16aa 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -333,6 +333,6 @@ fi case $1 in run_op_benchmark) prepare_env - gpu_op_benchmark + gpu_op_benchmark ;; esac diff --git a/tools/cinn/gen_c++_tutorial.py b/tools/cinn/gen_c++_tutorial.py index 97e6d16fef088..be391b44ef730 100644 --- a/tools/cinn/gen_c++_tutorial.py +++ b/tools/cinn/gen_c++_tutorial.py @@ -59,13 +59,13 @@ def code_block(self, lang: str, block: List[str]): break else: tail_valid_offset += 1 - logging.warning("block0: %s" % block) + logging.warning(f"block0: {block}") block = ( block[pre_valid_offset:-tail_valid_offset] if tail_valid_offset > 0 else block[pre_valid_offset:] ) - logging.warning("block1: %s" % block) + logging.warning(f"block1: {block}") if not block: return @@ -189,7 +189,7 @@ def eat_roc(self, header: str, content: ContentGenerator) -> None: code_block.append(line) line: str = content.get_line() - logging.warning("DOC content: %s" % code_block) + logging.warning(f"DOC content: {code_block}") self.doc.code_block(lang, code_block) diff --git a/tools/codestyle/clang-tidy.py b/tools/codestyle/clang-tidy.py index 404413b9b9945..7fe5029cd1823 100644 --- a/tools/codestyle/clang-tidy.py +++ b/tools/codestyle/clang-tidy.py @@ -166,9 +166,9 @@ def get_tidy_invocation( os.close(handle) start.append(name) for arg in extra_arg: - start.append('-extra-arg=%s' % arg) + start.append(f'-extra-arg={arg}') for arg in extra_arg_before: - start.append('-extra-arg-before=%s' % arg) + start.append(f'-extra-arg-before={arg}') start.append('-p=' + build_path) if quiet: start.append('-quiet') diff --git a/tools/continuous_integration/bisect.py b/tools/continuous_integration/bisect.py index c4b31bb6e8729..2feaf7be5ec6e 100644 --- a/tools/continuous_integration/bisect.py +++ b/tools/continuous_integration/bisect.py @@ -84,11 +84,11 @@ def print_arguments(): [f'git rev-list --first-parent {args.good_commit}...{args.bad_commit}'], shell=True, ) -sys.stdout.write('commits found:\n%s\n' % ret) +sys.stdout.write(f'commits found:\n{ret}\n') commits = ret.strip().split('\n') os.chdir(args.build_dir) # Clean up previous logs. -subprocess.check_output(['echo "" > %s' % args.log_file], shell=True) +subprocess.check_output([f'echo "" > {args.log_file}'], shell=True) last_culprit = '' while True: @@ -96,8 +96,7 @@ def print_arguments(): os.chdir(args.git_dir) subprocess.check_output( [ - 'git checkout %s && git clean -fd && git checkout .' - % args.bisect_branch + f'git checkout {args.bisect_branch} && git clean -fd && git checkout .' ], shell=True, ) @@ -109,7 +108,7 @@ def print_arguments(): pick_idx = len(commits) / 2 pick = commits[pick_idx] os.chdir(args.git_dir) - subprocess.check_output(['git checkout %s' % pick], shell=True) + subprocess.check_output([f'git checkout {pick}'], shell=True) # Clean builds and compile. # We assume mainline commits should always compile. 
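bisect.py, whose remaining hunks continue below, is a hand-rolled first-bad-commit search: check out the middle commit of the suspect range, rebuild, run ctest, and discard the half of the range the result rules out. Stripped of the build and test plumbing, the halving logic looks roughly like this sketch (names illustrative; it assumes deterministic tests and commits ordered from known-good to known-bad):

```python
def first_bad(commits, is_good):
    # Each is_good() probe stands in for the cmake/ctest cycle.
    lo, hi = 0, len(commits) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        if is_good(commits[mid]):
            lo = mid + 1  # culprit is newer than mid
        else:
            hi = mid      # mid already fails; culprit is mid or older
    return commits[lo]
```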
@@ -120,7 +119,7 @@ def print_arguments(): 'rm -rf * && ' f'cmake -DWITH_TESTING=ON {args.git_dir} >> {args.log_file} && make -j{args.build_parallel} >> {args.log_file}' ) - sys.stdout.write('cmd: %s\n' % cmd) + sys.stdout.write(f'cmd: {cmd}\n') try: subprocess.check_output([cmd], shell=True) except subprocess.CalledProcessError as e: @@ -130,7 +129,7 @@ def print_arguments(): passed = True try: cmd = f'ctest --repeat-until-fail {args.test_times} -R {args.test_target} >> {args.log_file}' - sys.stdout.write('cmd: %s\n' % cmd) + sys.stdout.write(f'cmd: {cmd}\n') subprocess.check_output([cmd], shell=True) except subprocess.CalledProcessError as e: passed = False @@ -145,4 +144,4 @@ def print_arguments(): break commits = commits[pick_idx + 1 :] -sys.stdout.write('Culprit commit: %s\n' % last_culprit) +sys.stdout.write(f'Culprit commit: {last_culprit}\n') diff --git a/tools/document_preview.sh b/tools/document_preview.sh index 47c5207074046..97c01ee96d03b 100755 --- a/tools/document_preview.sh +++ b/tools/document_preview.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -45,7 +45,7 @@ function get_docs_pr_num_from_paddle_pr_info(){ } # Attention: -# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs. +# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs. # 2. And /docs is used as the output of doc-build process. # 3. If conflicted with yours, please modify the definition of FLUIDDOCDIR and # OUTPUTDIR in the subsequent codes. diff --git a/tools/enforce/count_enforce_by_dir.sh b/tools/enforce/count_enforce_by_dir.sh index 77ffe9c158c7d..ba419f77f2bc1 100644 --- a/tools/enforce/count_enforce_by_dir.sh +++ b/tools/enforce/count_enforce_by_dir.sh @@ -15,10 +15,10 @@ # limitations under the License. # This script is used to count detail PADDLE checks in the paddle/fluid directory, -# contains the number of PADDLE checks under each folder, the statistical data +# contains the number of PADDLE checks under each folder, the statistical data # does not include subdirectories, only covers all files under the current directory. -# -# The three columns of data are: total number, valid number, invalid number. +# +# The three columns of data are: total number, valid number, invalid number. # The output format is easy to display as a markdown table. 
# Usage: bash count_enforce_by_dir.sh (run in tools directory) @@ -70,8 +70,8 @@ function count_dir_independently(){ enforce_count $1"/"$file dir_total_check_cnt dir_valid_check_cnt sub_dir_total_check_cnt=$(($sub_dir_total_check_cnt+$dir_total_check_cnt)) sub_dir_valid_check_cnt=$(($sub_dir_valid_check_cnt+$dir_valid_check_cnt)) - - count_dir_independently $1"/"$file $dir_total_check_cnt $dir_valid_check_cnt + + count_dir_independently $1"/"$file $dir_total_check_cnt $dir_valid_check_cnt fi done total_check_cnt=$(($2-$sub_dir_total_check_cnt)) diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh index c79d486c62838..b06514a4e03bb 100644 --- a/tools/enforce/count_enforce_by_file.sh +++ b/tools/enforce/count_enforce_by_file.sh @@ -16,8 +16,8 @@ # This script is used to count PADDLE checks by files in the paddle/fluid/operators directory, # contains the number of PADDLE checks under each file. -# -# The three columns of data are: total number, valid number, invalid number. +# +# The three columns of data are: total number, valid number, invalid number. # The output format is easy to display as a markdown table. # Usage: bash count_enforce_by_file.sh [target directory or file] (run in tools directory) diff --git a/tools/externalError/start.sh b/tools/externalError/start.sh index d60a26d157cce..057a67ef46a41 100644 --- a/tools/externalError/start.sh +++ b/tools/externalError/start.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
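For orientation, the count_enforce_by_dir.sh and count_enforce_by_file.sh scripts touched above tally PADDLE checks and print markdown-table-friendly totals. A rough Python equivalent of the per-directory tally (the file suffixes and regex here are assumptions for illustration; the real scripts additionally split totals into valid and invalid checks):

```python
import os
import re
from collections import Counter

CHECK = re.compile(r'PADDLE_ENFORCE\w*\(')

def count_checks(root):
    # Non-recursive per-directory totals, like count_enforce_by_dir.sh.
    counts = Counter()
    for dirpath, _, files in os.walk(root):
        for name in files:
            if name.endswith(('.cc', '.cu', '.h')):
                path = os.path.join(dirpath, name)
                with open(path, errors='ignore') as f:
                    counts[dirpath] += len(CHECK.findall(f.read()))
    return counts
```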
diff --git a/tools/final_ut_parallel_rule.py b/tools/final_ut_parallel_rule.py index e0fc86c19a8cc..2dbfdd39c1a2c 100644 --- a/tools/final_ut_parallel_rule.py +++ b/tools/final_ut_parallel_rule.py @@ -19,7 +19,7 @@ def classify_cases_by_mem(rootPath): """classify cases by mem""" - case_filename = '%s/build/classify_case_by_cardNum.txt' % rootPath + case_filename = f'{rootPath}/build/classify_case_by_cardNum.txt' case_exec_100 = [ 'test_conv_eltwiseadd_bn_fuse_pass', 'test_trt_convert_pool2d', @@ -124,14 +124,14 @@ def classify_cases_by_mem(rootPath): else: case_mem_1[case] = new_lastest_mem[case]["mem_nvidia"] - with open('/pre_test/%s_mem0' % cardType, 'w') as f: + with open(f'/pre_test/{cardType}_mem0', 'w') as f: f.write(case_mem_0) f.close() case_mem_1_sort = sorted(case_mem_1.items(), key=lambda x: x[1]) case_mem_1_line = '^job$' mem_1_sum = 0 - with open('/pre_test/%s' % cardType, 'w') as f_not_0: + with open(f'/pre_test/{cardType}', 'w') as f_not_0: for index in case_mem_1_sort: if mem_1_sum < 14 * 1024 * 2: mem_1_sum += index[1] @@ -150,7 +150,7 @@ def classify_cases_by_mem(rootPath): f_not_0.write(case_mem_1_line + '\n') f_not_0.close() - os.system('cp %s/build/nightly_case /pre_test/' % rootPath) + os.system(f'cp {rootPath}/build/nightly_case /pre_test/') if __name__ == '__main__': diff --git a/tools/gen_alias_mapping.sh b/tools/gen_alias_mapping.sh index 3ab1e68b37557..c57f3f6bba2b1 100755 --- a/tools/gen_alias_mapping.sh +++ b/tools/gen_alias_mapping.sh @@ -17,16 +17,16 @@ # Brief: # This code is used for generating the mapping list of Paddle API alias. # Only the APIs set with the `DEFINE_ALIAS` flag is enable. -# +# # Arguments: # None -# +# # Usage: -# Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh` +# Go into the `Paddle` folder and just run `./tools/gen_alias_mapping.sh` # # Returns: # succ: 0 -# +# # Will also print the mapping list to stdout. The format of each line is as below: # <real API implement>\t<API recommend>,<API other alias name1>,<API other alias name2>,... @@ -38,7 +38,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \ | grep 'DEFINE_ALIAS' \ | perl -ne ' if (/\/python\/(.*):from (\.*)(\w.*) import (.*?)\s+#DEFINE_ALIAS\s+$/) { - my @arr = split(", ", $4); + my @arr = split(", ", $4); foreach $i (@arr) { printf "%s|%s|%s|%d\n", $3, $i, substr($1, 0, -3), length($2); } @@ -66,7 +66,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \ } key = key""new; n2o[key] = val; - } + } END { for (new in n2o) { old = n2o[new] in n2o ? n2o[n2o[new]] : n2o[new]; @@ -78,7 +78,7 @@ find ${PADDLE_ROOT}/python/ -name '*.py' \ { o2n[$1] = o2n[$1] ? 
o2n[$1]","$3 : $3; } - END { + END { for (i in o2n) { print i"\t"o2n[i]; } diff --git a/tools/gen_tensor_stub.py b/tools/gen_tensor_stub.py index 00c7fb0c2e50c..422b3004f5266 100644 --- a/tools/gen_tensor_stub.py +++ b/tools/gen_tensor_stub.py @@ -15,17 +15,18 @@ from __future__ import annotations import argparse +import importlib import inspect import logging import re +import sys +import types from dataclasses import dataclass from functools import cached_property, lru_cache from typing import Any, Callable, Literal from typing_extensions import TypeAlias -import paddle - logging.basicConfig(style="{", format="{message}", level=logging.INFO) logger = logging.getLogger("Generating stub file for paddle.Tensor") logger.setLevel(logging.INFO) @@ -102,7 +103,6 @@ def find_apis(self, api_name: str) -> list[dict[str, tuple[str, int, int]]]: api = [] for mo in pattern.finditer(self._template): _indent = mo.group('indent') - _def_api = mo.group('def_api') _signature = mo.group('signature') _docstring = mo.group('docstring') _ellipsis = mo.group('ellipsis') @@ -110,26 +110,15 @@ def find_apis(self, api_name: str) -> list[dict[str, tuple[str, int, int]]]: _comment = '' if _comment is None else _comment _start_index, _end_index = mo.span() - - _start_indent = _start_index - _end_indent = _start_indent + len(_indent) - - _start_def_api = _end_indent - _end_def_api = _start_def_api + len(_def_api) - - _start_signature = _end_def_api - _end_signature = _start_signature + len(_signature) - - _start_docstring = _end_signature - _end_docstring = _start_docstring + len(_docstring) - - _start_ellipsis = _end_docstring - _end_ellipsis = _start_ellipsis + len(_ellipsis) - + _start_indent, _end_indent = mo.span('indent') + _start_signature, _end_signature = mo.span('signature') + _start_docstring, _end_docstring = mo.span('docstring') + _start_ellipsis, _end_ellipsis = mo.span('ellipsis') _start_comment = _end_ellipsis _end_comment = _start_comment + len(_comment) - assert _end_index == _end_comment + assert _start_index == _start_indent + assert _end_comment == _end_index _api = { 'indent': (_indent, _start_indent, _end_indent), @@ -216,7 +205,10 @@ def add_doc(self, doc: str): self.insert_template(docstring, _end_index, _end_index) def codegen(self) -> str: - return self._template + header = ( + '# This file is auto generated by `tools/gen_tensor_stub.py`.\n\n' + ) + return header + self._template def is_inherited_member(name: str, cls: type) -> bool: @@ -336,7 +328,27 @@ def func_doc_to_method_doc(func_doc: str) -> str: return method_doc +def try_import_paddle() -> types.ModuleType | None: + try: + return importlib.import_module('paddle') + except ModuleNotFoundError: + sys.stderr.write( + '''ERROR: Can NOT import paddle. + We could import paddle without installation, with all libs (.dll or .so) copied into dir `paddle/libs`, + or path already been set for the system. + ''' + ) + + def get_tensor_members(): + paddle = try_import_paddle() + if not paddle: + raise ( + ModuleNotFoundError( + 'Can NOT import paddle from tools/gen_tensor_stub.py.' 
+ ) + ) + tensor_class = paddle.Tensor members: dict[int, Member] = {} @@ -433,7 +445,7 @@ def get_tensor_template(path: str) -> str: return ''.join(f.readlines()) -def main(): +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( @@ -442,7 +454,6 @@ def main(): type=str, default="python/paddle/tensor/tensor.prototype.pyi", ) - parser.add_argument( "-o", "--output-file", @@ -452,12 +463,16 @@ def main(): args = parser.parse_args() + return args + + +def generate_stub_file(input_file=None, output_file=None): # Get members of Tensor tensor_members = get_tensor_members() logging.debug(f'total members in Tensor: {len(tensor_members)}') # Get tensor template - tensor_template = get_tensor_template(args.input_file) + tensor_template = get_tensor_template(input_file) # Generate the Tensor stub tensor_gen = TensorGen(tensor_template) @@ -473,9 +488,14 @@ def main(): tensor_gen.add_doc(member.doc) # Write to target file - with open(args.output_file, "w", encoding="utf-8") as f: + with open(output_file, "w", encoding="utf-8") as f: f.write(tensor_gen.codegen()) +def main(): + args = parse_args() + generate_stub_file(args.input_file, args.output_file) + + if __name__ == "__main__": main() diff --git a/tools/get_build_time.sh b/tools/get_build_time.sh index 496c8c12d6ca3..85100bb50c761 100755 --- a/tools/get_build_time.sh +++ b/tools/get_build_time.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh index bce338a8619e6..b7ec2e77a3a84 100755 --- a/tools/get_cpu_info.sh +++ b/tools/get_cpu_info.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -54,7 +54,7 @@ echo "OS Version : `uname -o`" echo "Kernel Release Version : `uname -r`" echo "Kernel Patch Version : `uname -v`" echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" -if command -v cmake >/dev/null 2>&1; then +if command -v cmake >/dev/null 2>&1; then cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'` else cmake_ver=" Not installed" diff --git a/tools/get_op_list.sh b/tools/get_op_list.sh index 2e4cad13582df..2b5d7f419b1d2 100644 --- a/tools/get_op_list.sh +++ b/tools/get_op_list.sh @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/get_ut_file_map.py b/tools/get_ut_file_map.py index bf469eab98747..42b1c251f19a1 100644 --- a/tools/get_ut_file_map.py +++ b/tools/get_ut_file_map.py @@ -19,8 +19,8 @@ def get_all_paddle_file(rootPath): """get all file in Paddle repo: paddle/fluild, python""" - traverse_files = ['%s' % rootPath] - all_file_paddle = '%s/build/all_file_paddle' % rootPath + traverse_files = [f'{rootPath}'] + all_file_paddle = f'{rootPath}/build/all_file_paddle' all_file_paddle_list = [] with open(all_file_paddle, 'w') as f: for filename in traverse_files: @@ -32,7 +32,7 @@ def get_all_paddle_file(rootPath): def get_all_uts(rootPath): - all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + all_uts_paddle = f'{rootPath}/build/all_uts_paddle' os.system( fr'cd {rootPath}/build && ctest -N -V | grep -Ei "Test[ \t]+#" | grep -oEi "\w+$" > {all_uts_paddle}' ) @@ -42,28 +42,28 @@ def remove_useless_file(rootPath): """remove useless file in ut_file_map.json""" all_file_paddle_list = get_all_paddle_file(rootPath) ut_file_map_new = {} - ut_file_map = "%s/build/ut_file_map.json" % rootPath + ut_file_map = f"{rootPath}/build/ut_file_map.json" with open(ut_file_map, 'r') as load_f: load_dict = json.load(load_f) for key in load_dict: if key in all_file_paddle_list: ut_file_map_new[key] = load_dict[key] - with open("%s/build/ut_file_map.json" % rootPath, "w") as f: + with open(f"{rootPath}/build/ut_file_map.json", "w") as f: json.dump(ut_file_map_new, f, indent=4) print("remove_useless_file ut_file_map success!!") def handle_ut_file_map(rootPath): utNotSuccess_list = [] - ut_map_path = "%s/build/ut_map" % rootPath + ut_map_path = f"{rootPath}/build/ut_map" files = os.listdir(ut_map_path) ut_file_map = {} count = 0 - not_success_file = open("%s/build/prec_delta" % rootPath, 'w') + not_success_file = open(f"{rootPath}/build/prec_delta", 'w') # if testdir is not made,write the test into prec_delta get_all_uts(rootPath) - all_ut = '%s/build/all_uts_paddle' % rootPath + all_ut = f'{rootPath}/build/all_uts_paddle' with open(all_ut, 'r') as f: all_ut_list = [] for ut in f.readlines(): @@ -73,7 +73,7 @@ def handle_ut_file_map(rootPath): for ut in all_ut_list: filedir = f'{rootPath}/build/ut_map/{ut}' if not os.path.exists(filedir): - not_success_file.write('%s\n' % ut) + not_success_file.write(f'{ut}\n') utNotSuccess_list.append(ut) # if fnda.tmp not exists,write the test into prec_delta for ut in files: @@ -108,7 +108,7 @@ def handle_ut_file_map(rootPath): ut_file_map[source_file].append(ut) f.close() else: - not_success_file.write('%s\n' % ut) + not_success_file.write(f'{ut}\n') utNotSuccess_list.append(ut) not_success_file.close() @@ -135,13 +135,13 @@ def handle_ut_file_map(rootPath): if source_file not in ut_file_map: ut_file_map[source_file] = [] f.close() - with open("%s/build/ut_file_map.json" % rootPath, "w") as f: + with open(f"{rootPath}/build/ut_file_map.json", "w") as f: json.dump(ut_file_map, f, indent=4) def notsuccessfuc(rootPath): utNotSuccess = '' - ut_map_path = "%s/build/ut_map" % rootPath + ut_map_path = f"{rootPath}/build/ut_map" files = os.listdir(ut_map_path) count = 0 @@ -154,7 +154,7 @@ def notsuccessfuc(rootPath): pass else: count = count + 1 - utNotSuccess = utNotSuccess + 
'^%s$|' % ut + utNotSuccess = utNotSuccess + f'^{ut}$|' # ut not exec @@ -166,7 +166,7 @@ def notsuccessfuc(rootPath): if ut not in files: print(ut) count = count + 1 - utNotSuccess = utNotSuccess + '^%s$|' % ut + utNotSuccess = utNotSuccess + f'^{ut}$|' if utNotSuccess != '': print("utNotSuccess count: %s" % count) @@ -176,18 +176,17 @@ def notsuccessfuc(rootPath): def ut_file_map_supplement(rootPath): - ut_file_map_new = "%s/build/ut_file_map.json" % rootPath + ut_file_map_new = f"{rootPath}/build/ut_file_map.json" precision_test_map_store_dir = "/precision_test_map_store" - os.system('mkdir %s' % precision_test_map_store_dir) + os.system(f'mkdir {precision_test_map_store_dir}') os.system( - 'cd %s && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/ut_file_map.json --no-check-certificate' - % precision_test_map_store_dir + f'cd {precision_test_map_store_dir} && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/ut_file_map.json --no-check-certificate' ) - ut_file_map_old = "%s/ut_file_map.json" % precision_test_map_store_dir + ut_file_map_old = f"{precision_test_map_store_dir}/ut_file_map.json" with open(ut_file_map_new, 'r') as load_f: load_dict_new = json.load(load_f) - all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + all_uts_paddle = f'{rootPath}/build/all_uts_paddle' with open(all_uts_paddle, 'r') as f: all_uts_paddle_list = [] @@ -195,15 +194,14 @@ def ut_file_map_supplement(rootPath): all_uts_paddle_list.append(ut.strip()) f.close() - with open("%s/ut_file_map.json" % precision_test_map_store_dir, "w") as f: + with open(f"{precision_test_map_store_dir}/ut_file_map.json", "w") as f: json.dump(load_dict_new, f, indent=4) print("load_dict_new success!!") os.system( - 'cd %s && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/prec_delta --no-check-certificate' - % precision_test_map_store_dir + f'cd {precision_test_map_store_dir} && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/tmp_test/prec_delta --no-check-certificate' ) - prec_delta_new = "%s/build/prec_delta" % rootPath + prec_delta_new = f"{rootPath}/build/prec_delta" with open(prec_delta_new, 'r') as f: prec_delta_new_list = [] for ut in f.readlines(): @@ -212,7 +210,7 @@ def ut_file_map_supplement(rootPath): prec_delta_new_list.append( 'test_py_reader_error_msg' ) # add a python case for pycoverage - prec_delta_file = open("%s/prec_delta" % precision_test_map_store_dir, 'w') + prec_delta_file = open(f"{precision_test_map_store_dir}/prec_delta", 'w') for ut in prec_delta_new_list: prec_delta_file.write(ut + '\n') print("prec_delta_file success!!") @@ -220,7 +218,7 @@ def ut_file_map_supplement(rootPath): def utmap_analysis(rootPath): - ut_file_map_new = "%s/build/ut_file_map.json" % rootPath + ut_file_map_new = f"{rootPath}/build/ut_file_map.json" with open(ut_file_map_new, 'r') as load_f: load_dict_new = json.load(load_f) print(len(load_dict_new)) diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index 07122405a21d7..e0669fb85e658 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -1,11 +1,11 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -40,7 +40,7 @@ function get_quickly_disable_ut() { fi } -# disable test: +# disable test: # test_dygraph_dataparallel_bf16 # test_dygraph_sharding_stage2_bf16 # test_dygraph_sharding_stage3_bf16 diff --git a/tools/group_case_for_parallel.py b/tools/group_case_for_parallel.py index 66187ca4b0607..9af5e084bada2 100644 --- a/tools/group_case_for_parallel.py +++ b/tools/group_case_for_parallel.py @@ -40,29 +40,29 @@ def group_case_for_parallel(rootPath): ) # get nightly tests - nightly_tests_file = open('%s/tools/nightly_case' % rootPath, 'r') + nightly_tests_file = open(f'{rootPath}/tools/nightly_case', 'r') nightly_tests = nightly_tests_file.read().strip().split('\n') nightly_tests_file.close() parallel_case_file_list = [ - '%s/tools/single_card_tests_mem0' % rootPath, - '%s/tools/single_card_tests' % rootPath, - '%s/tools/multiple_card_tests_mem0' % rootPath, - '%s/tools/multiple_card_tests' % rootPath, - '%s/tools/exclusive_card_tests_mem0' % rootPath, - '%s/tools/exclusive_card_tests' % rootPath, + f'{rootPath}/tools/single_card_tests_mem0', + f'{rootPath}/tools/single_card_tests', + f'{rootPath}/tools/multiple_card_tests_mem0', + f'{rootPath}/tools/multiple_card_tests', + f'{rootPath}/tools/exclusive_card_tests_mem0', + f'{rootPath}/tools/exclusive_card_tests', ] - case_file = '%s/build/ut_list' % rootPath + case_file = f'{rootPath}/build/ut_list' if os.path.exists(case_file): f = open(case_file, 'r') all_need_run_cases = f.read().strip().split('\n') if len(all_need_run_cases) == 1 and all_need_run_cases[0] == '': f.close() - case_file = '%s/build/all_ut_list' % rootPath + case_file = f'{rootPath}/build/all_ut_list' f = open(case_file, 'r') all_need_run_cases = f.read().strip().split('\n') else: - case_file = '%s/build/all_ut_list' % rootPath + case_file = f'{rootPath}/build/all_ut_list' f = open(case_file, 'r') all_need_run_cases = f.read().strip().split('\n') @@ -71,7 +71,7 @@ def group_case_for_parallel(rootPath): all_group_case = [] for filename in parallel_case_file_list: fi = open(filename, 'r') - new_f = open('%s_new' % filename, 'w') + new_f = open(f'{filename}_new', 'w') lines = fi.readlines() new_case_file_list = [] for line in lines: @@ -88,7 +88,7 @@ def group_case_for_parallel(rootPath): for line in new_case_file_list: cases = '$|^'.join(case for case in line) - cases = '^job$|^%s$' % cases + cases = f'^job$|^{cases}$' new_f.write(cases + '\n') fi.close() new_f.close() @@ -98,10 +98,10 @@ def group_case_for_parallel(rootPath): if len(all_need_run_cases) != 0: for case in all_need_run_cases: if case not in nightly_tests: - cases = cases + '$|^%s' % case - cases = '%s$' % cases + cases = cases + f'$|^{case}' + cases = f'{cases}$' - new_f = open('%s/tools/no_parallel_case_file' % rootPath, 'w') + new_f = open(f'{rootPath}/tools/no_parallel_case_file', 'w') new_f.write(cases + '\n') new_f.close() f.close() diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py index 86458045d3de8..656e47fdba896 100644 --- a/tools/handle_h_cu_file.py +++ b/tools/handle_h_cu_file.py @@ -43,7 +43,7 @@ def threadPool(threadPoolNum): def get_h_file_md5(rootPath): - h_cu_files = '%s/tools/h_cu_files.log' % rootPath + h_cu_files = f'{rootPath}/tools/h_cu_files.log' f = 
open(h_cu_files) lines = f.readlines() for line in lines: @@ -52,7 +52,7 @@ def get_h_file_md5(rootPath): def insert_pile_to_h_file(rootPath): - h_cu_files = '%s/tools/h_cu_files.log' % rootPath + h_cu_files = f'{rootPath}/tools/h_cu_files.log' f = open(h_cu_files) lines = f.readlines() for line in lines: @@ -60,7 +60,7 @@ def insert_pile_to_h_file(rootPath): func = line.replace('/', '_').replace('.', '_') os.system(f'echo "\n#ifndef _PRECISE{func.upper()}_\n" >> {line}') os.system(f'echo "#define _PRECISE{func.upper()}_" >> {line}') - os.system('echo "\n#include <cstdio>\n" >> %s' % line) + os.system(f'echo "\n#include <cstdio>\n" >> {line}') os.system( f'echo "__attribute__((constructor)) static void calledFirst{func}()\n{{" >> {line}' ) @@ -68,43 +68,40 @@ def insert_pile_to_h_file(rootPath): 'echo \' fprintf(stderr,"precise test map fileeee: %%s\\\\n", __FILE__);\n}\' >> %s' % line ) - os.system('echo "\n#endif" >> %s' % line) + os.system(f'echo "\n#endif" >> {line}') def add_simple_cxx_test(rootPath): - variant_test_path = '%s/paddle/utils/variant_test.cc' % rootPath - variant_test_cmakeflie_path = '%s/paddle/utils/CMakeLists.txt' % rootPath + variant_test_path = f'{rootPath}/paddle/utils/variant_test.cc' + variant_test_cmakeflie_path = f'{rootPath}/paddle/utils/CMakeLists.txt' if os.path.exists(variant_test_path) and os.path.exists( variant_test_cmakeflie_path ): - simple_test_path = '%s/paddle/utils/simple_precision_test.cc' % rootPath - os.system('touch %s' % simple_test_path) + simple_test_path = f'{rootPath}/paddle/utils/simple_precision_test.cc' + os.system(f'touch {simple_test_path}') + os.system(f"echo '#include \"gtest/gtest.h\"\n' >> {simple_test_path}") os.system( - "echo '#include \"gtest/gtest.h\"\n' >> %s" % simple_test_path - ) - os.system( - 'echo "TEST(interface_test, type) { }\n" >> %s' % simple_test_path + f'echo "TEST(interface_test, type) {{ }}\n" >> {simple_test_path}' ) os.system('echo "cc_test(" >> %s' % variant_test_cmakeflie_path) os.system( - 'echo " simple_precision_test" >> %s' % variant_test_cmakeflie_path + f'echo " simple_precision_test" >> {variant_test_cmakeflie_path}' ) os.system( - 'echo " SRCS simple_precision_test.cc" >> %s' - % variant_test_cmakeflie_path + f'echo " SRCS simple_precision_test.cc" >> {variant_test_cmakeflie_path}' ) - os.system('echo " DEPS gtest)\n" >> %s' % variant_test_cmakeflie_path) + os.system(f'echo " DEPS gtest)\n" >> {variant_test_cmakeflie_path}') def remove_pile_from_h_file(rootPath): - h_cu_files = '%s/tools/h_cu_files.log' % rootPath + h_cu_files = f'{rootPath}/tools/h_cu_files.log' f = open(h_cu_files) lines = f.readlines() count = 12 for line in lines: line = line.strip() while count > 0: - os.system("sed -i '$d' %s" % line) + os.system(f"sed -i '$d' {line}") count = count - 1 count = 12 diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh index 31e1a44540133..bb851c11df6db 100755 --- a/tools/nvcc_lazy.sh +++ b/tools/nvcc_lazy.sh @@ -17,7 +17,7 @@ echo "#!/usr/bin/env bash" >> $1 echo "unset GREP_OPTIONS" >> $1 echo "set -e" >> $1 -echo -e >> $1 +echo -e >> $1 echo "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved." >> $1 echo "#" >> $1 echo "# Licensed under the Apache License, Version 2.0 (the \"License\");" >> $1 @@ -25,7 +25,7 @@ echo "# you may not use this file except in compliance with the License." 
>> $1
 echo "# You may obtain a copy of the License at" >> $1
 echo "#" >> $1
 echo "# http://www.apache.org/licenses/LICENSE-2.0" >> $1
-echo "#" >> $1 
+echo "#" >> $1
 echo "# Unless required by applicable law or agreed to in writing, software" >> $1
 echo "# distributed under the License is distributed on an \"AS IS\" BASIS," >> $1
 echo "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." >> $1
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index d09a04abd045c..ba3e08b154541 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -15,19 +15,34 @@
 Print all signature of a python module in alphabet order.
 
 Usage:
-    ./print_signature "paddle.base" > signature.txt
+    python tools/print_signatures.py "paddle" > API.spec
 """
+from __future__ import annotations
+
 import argparse
 import collections
 import hashlib
 import inspect
 import logging
 import pkgutil
+import re
 import sys
+from typing import Literal
 
 import paddle
 
+SpecFields = Literal[
+    "args",
+    "varargs",
+    "varkw",
+    "defaults",
+    "kwonlyargs",
+    "kwonlydefaults",
+    "annotations",
+    "document",
+]
+
 member_dict = collections.OrderedDict()
 
 visited_modules = set()
@@ -61,21 +76,6 @@ def md5(doc):
     return md5sum
 
 
-def is_primitive(instance):
-    int_types = (int,)
-    pritimitive_types = int_types + (float, str)
-    if isinstance(instance, pritimitive_types):
-        return True
-    elif isinstance(instance, (list, tuple, set)):
-        for obj in instance:
-            if not is_primitive(obj):
-                return False
-
-        return True
-    else:
-        return False
-
-
 ErrorSet = set()
 IdSet = set()
 skiplist = []
@@ -200,9 +200,7 @@ def insert_api_into_dict(full_name, gen_doc_anno=None):
         if gen_doc_anno:
             api_info_dict[fc_id]["gen_doc_anno"] = gen_doc_anno
         if inspect.isfunction(obj):
-            api_info_dict[fc_id]["signature"] = repr(
-                inspect.getfullargspec(obj)
-            ).replace('FullArgSpec', 'ArgSpec', 1)
+            api_info_dict[fc_id]["signature"] = inspect.getfullargspec(obj)
     return api_info_dict[fc_id]
 
 
@@ -239,85 +237,6 @@ def process_module(m, attr="__all__"):
     return api_counter
 
 
-def check_public_api():
-    modulelist = [  # npqa
-        paddle,
-        paddle.amp,
-        paddle.nn,
-        paddle.nn.functional,
-        paddle.nn.initializer,
-        paddle.nn.utils,
-        paddle.static,
-        paddle.static.nn,
-        paddle.io,
-        paddle.jit,
-        paddle.metric,
-        paddle.distribution,
-        paddle.optimizer,
-        paddle.optimizer.lr,
-        paddle.regularizer,
-        paddle.text,
-        paddle.utils,
-        paddle.utils.download,
-        paddle.utils.cpp_extension,
-        paddle.sysconfig,
-        paddle.vision,
-        paddle.vision.datasets,
-        paddle.vision.models,
-        paddle.vision.transforms,
-        paddle.vision.ops,
-        paddle.distributed,
-        paddle.distributed.fleet,
-        paddle.distributed.fleet.utils,
-        paddle.distributed.parallel,
-        paddle.distributed.utils,
-        paddle.callbacks,
-        paddle.hub,
-        paddle.autograd,
-        paddle.incubate,
-        paddle.inference,
-        paddle.onnx,
-        paddle.device,
-        paddle.audio,
-        paddle.audio.backends,
-        paddle.audio.datasets,
-        paddle.sparse,
-        paddle.sparse.nn,
-        paddle.sparse.nn.functional,
-    ]
-
-    apinum = 0
-    alldict = {}
-    for module in modulelist:
-        if hasattr(module, '__all__'):
-            old_all = module.__all__
-        else:
-            old_all = []
-            dirall = dir(module)
-            for item in dirall:
-                if item.startswith('__'):
-                    continue
-                old_all.append(item)
-        apinum += len(old_all)
-        alldict.update({module.__name__: old_all})
-
-    old_all = []
-    dirall = dir(paddle.Tensor)
-    for item in dirall:
-        if item.startswith('_'):
-            continue
-        old_all.append(item)
-    apinum += len(old_all)
-    alldict.update({'paddle.Tensor': old_all})
-
-    for module, allapi in alldict.items():
-        for member_name in allapi:
-            cur_name = module + '.' + member_name
-            instance = eval(cur_name)
-            doc_md5 = md5(instance.__doc__)
-            member_dict[cur_name] = f"({cur_name}, ('document', '{doc_md5}'))"
-
-
 def check_allmodule_callable():
     modulelist = [paddle]
     for m in modulelist:
@@ -326,69 +245,89 @@ def check_allmodule_callable():
     return member_dict
 
 
+class ApiSpecFormatter:
+    def __init__(self, show_fields: SpecFields):
+        self.show_fields = show_fields
+
+    def format_spec(self, spec: inspect.FullArgSpec | None) -> str:
+        if spec is None:
+            return "ArgSpec()"
+        inner_str = ", ".join(
+            f"{field}={getattr(spec, field)!r}"
+            for field in spec._fields
+            if field in self.show_fields
+        )
+        return f"ArgSpec({inner_str})"
+
+    def format_doc(self, doc: str) -> str:
+        if "document" not in self.show_fields:
+            return "('document', '**********')"
+        return f"('document', '{md5(doc)}')"
+
+    def format(self, api_name: str, spec: inspect.FullArgSpec, doc: str) -> str:
+        return f"{api_name} ({self.format_spec(spec)}, {self.format_doc(doc)})"
+
+
 def parse_args():
     """
     Parse input arguments
     """
     parser = argparse.ArgumentParser(description='Print Apis Signatures')
-    parser.add_argument('--debug', dest='debug', action="store_true")
+    parser.add_argument('module', type=str, help='module', default='paddle')
     parser.add_argument(
-        '--method',
-        dest='method',
+        '--skipped',
+        dest='skipped',
         type=str,
-        default='get_all_api',
-        help="using get_all_api or from_modulelist",
+        help='Skip checking submodules, supports regex',
+        default=r'paddle\.base\.libpaddle\.(eager|pir)\.ops',
    )
     parser.add_argument(
-        'module', type=str, help='module', default='paddle'
-    )  # not used
-    parser.add_argument(
-        '--skipped',
-        dest='skipped',
+        '--show-fields',
         type=str,
-        help='Skip Checking submodules',
-        default='paddle.base.libpaddle.eager.ops',
+        default="args,varargs,varkw,defaults,kwonlyargs,kwonlydefaults,annotations,document",
+        help="show fields in arg spec, separated by comma, e.g. 'args,varargs'",
     )
-
-    if len(sys.argv) == 1:
-        args = parser.parse_args(['paddle'])
-        return args
-    # parser.print_help()
-    # sys.exit(1)
-
     args = parser.parse_args()
     return args
 
 
+def create_api_filter(skipped_regex: str):
+    if not skipped_regex:
+        return lambda api_name: True
+    skipped_pattern = re.compile(skipped_regex)
+
+    def api_filter(api_name: str) -> bool:
+        return not skipped_pattern.match(api_name)
+
+    return api_filter
+
+
 if __name__ == '__main__':
     args = parse_args()
     check_allmodule_callable()
 
-    if args.method == 'from_modulelist':
-        check_public_api()
-        for name in member_dict:
-            print(name, member_dict[name])
-    elif args.method == 'get_all_api':
-        get_all_api()
-        all_api_names_to_k = {}
-        for k, api_info in api_info_dict.items():
-            # 1. the shortest suggested_name may be renamed;
-            # 2. some api's fullname is not accessable, the module name of it is overrided by the function with the same name;
-            api_name = sorted(api_info['all_names'])[0]
-            all_api_names_to_k[api_name] = k
-        all_api_names_sorted = sorted(all_api_names_to_k.keys())
-        for api_name in all_api_names_sorted:
-            if args.skipped != '' and api_name.find(args.skipped) >= 0:
-                continue
-            api_info = api_info_dict[all_api_names_to_k[api_name]]
-            print(
-                "{} ({}, ('document', '{}'))".format(
-                    api_name,
-                    api_info['signature']
-                    if 'signature' in api_info
-                    else 'ArgSpec()',
-                    md5(api_info['docstring']),
-                )
+    get_all_api(args.module)
+    api_filter = create_api_filter(args.skipped)
+    spec_formatter = ApiSpecFormatter(args.show_fields.split(','))
+
+    all_api_names_to_k = {}
+    for k, api_info in api_info_dict.items():
+        # 1. the shortest suggested_name may be renamed;
+        # 2. some api's fullname is not accessible; its module name is overridden by the function with the same name;
+        api_name = sorted(api_info['all_names'])[0]
+        all_api_names_to_k[api_name] = k
+    all_api_names_sorted = sorted(all_api_names_to_k.keys())
+    for api_name in all_api_names_sorted:
+        if not api_filter(api_name):
+            continue
+        api_info = api_info_dict[all_api_names_to_k[api_name]]
+
+        print(
+            spec_formatter.format(
+                api_name,
+                api_info.get('signature'),
+                api_info['docstring'],
            )
+        )
 
     if len(ErrorSet) == 0:
         sys.exit(0)
diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py
index 12f15a5dec6e1..d3758493d0c00 100644
--- a/tools/prune_for_jetson.py
+++ b/tools/prune_for_jetson.py
@@ -101,9 +101,9 @@ def prune_phi_kernels():
 def apply_patches():
     work_path = os.path.dirname(os.path.abspath(__file__)) + "/../"
     ret = os.system(
-        "cd %s && rm -f paddle/fluid/inference/api/tensorrt_predictor.* "
+        f"cd {work_path} && rm -f paddle/fluid/inference/api/tensorrt_predictor.* "
         " && rm -f paddle/fluid/inference/api/paddle_tensorrt_predictor.h "
-        " && git apply tools/infer_prune_patches/*.patch && cd -" % work_path
+        " && git apply tools/infer_prune_patches/*.patch && cd -"
     )
     return ret == 0
 
@@ -120,7 +120,7 @@ def append_fluid_kernels():
     for op in op_white_list:
         append_str = (
             append_str
-            + "file(APPEND ${pybind_file} \"USE_OP__(%s);\\n\")\n" % op
+            + f"file(APPEND ${{pybind_file}} \"USE_OP__({op});\\n\")\n"
         )
 
     with open(file_name, 'r', encoding='utf-8') as f:
@@ -154,11 +154,9 @@ def append_fluid_kernels():
     for op in op_white_list:
         patterns = {
-            "REGISTER_OPERATOR": r"REGISTER_OPERATOR\(\s*%s\s*," % op,
-            "REGISTER_OP_CPU_KERNEL": r"REGISTER_OP_CPU_KERNEL\(\s*%s\s*,"
-            % op,
-            "REGISTER_OP_CUDA_KERNEL": r"REGISTER_OP_CUDA_KERNEL\(\s*%s\s*,"
-            % op,
+            "REGISTER_OPERATOR": rf"REGISTER_OPERATOR\(\s*{op}\s*,",
+            "REGISTER_OP_CPU_KERNEL": rf"REGISTER_OP_CPU_KERNEL\(\s*{op}\s*,",
+            "REGISTER_OP_CUDA_KERNEL": rf"REGISTER_OP_CUDA_KERNEL\(\s*{op}\s*,",
         }
         for k, p in patterns.items():
             matches = re.findall(p, content, flags=re.DOTALL)
diff --git a/tools/sampcd_processor_utils.py b/tools/sampcd_processor_utils.py
index ff6de2b598326..aaf61fcd88dc0 100644
--- a/tools/sampcd_processor_utils.py
+++ b/tools/sampcd_processor_utils.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
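To make the spec-line format concrete: ApiSpecFormatter above joins the sorted api name, the selected ArgSpec fields, and the docstring digest into one line of API.spec, and the PAT_API_SPEC_MEMBER / PAT_API_SPEC_SIGNATURE patterns defined just below in sampcd_processor_utils.py parse such lines back. A minimal sketch of the emitted shape, using a hypothetical function my.mod.demo rather than a real Paddle api:

    import hashlib
    import inspect

    def demo(x, axis=0, *, name=None):  # a hypothetical api, not part of Paddle
        """A hypothetical docstring."""

    spec = inspect.getfullargspec(demo)
    doc_md5 = hashlib.md5("A hypothetical docstring.".encode('utf-8')).hexdigest()
    # with --show-fields 'args,defaults,document' the formatter emits roughly:
    #   my.mod.demo (ArgSpec(args=['x', 'axis'], defaults=(0,)), ('document', '<32 hex chars>'))
    print(f"my.mod.demo (ArgSpec(args={spec.args!r}, defaults={spec.defaults!r}), ('document', '{doc_md5}'))")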
+from __future__ import annotations
+
 import argparse
 import inspect
 import logging
@@ -48,6 +50,12 @@
 API_DIFF_SPEC_FN = 'dev_pr_diff_api.spec'
 TEST_TIMEOUT = 10
 
+PAT_API_SPEC_MEMBER = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})')
+# include the ArgSpec so that changing an API's type annotation can trigger the CI
+PAT_API_SPEC_SIGNATURE = re.compile(
+    r'^(paddle[^,]+)\s+\((ArgSpec.*),.*document\W*([0-9a-z]{32})'
+)
+
 
 class Result:
     # name/key for result
@@ -66,7 +74,7 @@ class Result:
     order: int = 0
 
     @classmethod
-    def msg(cls, count: int, env: typing.Set) -> str:
+    def msg(cls, count: int, env: set) -> str:
         """Message for logging with api `count` and running `env`."""
         raise NotImplementedError
 
@@ -85,8 +93,8 @@ class MetaResult(type):
     def __new__(
         mcs,
         name: str,
-        bases: typing.Tuple[type, ...],
-        namespace: typing.Dict[str, typing.Any],
+        bases: tuple[type, ...],
+        namespace: dict[str, typing.Any],
     ) -> type:
         cls = super().__new__(mcs, name, bases, namespace)
         if issubclass(cls, Result):
@@ -104,7 +112,7 @@ def get(mcs, name: str) -> type:
         return mcs.__cls_map.get(name)
 
     @classmethod
-    def cls_map(mcs) -> typing.Dict[str, Result]:
+    def cls_map(mcs) -> dict[str, Result]:
         return mcs.__cls_map
 
 
@@ -290,7 +298,7 @@ def prepare(self, test_capacity: set) -> None:
         """
         pass
 
-    def run(self, api_name: str, docstring: str) -> typing.List[TestResult]:
+    def run(self, api_name: str, docstring: str) -> list[TestResult]:
         """Extract codeblocks from docstring, and run the test.
 
         Run only one docstring at a time.
@@ -304,7 +312,7 @@ def run(self, api_name: str, docstring: str) -> typing.List[TestResult]:
         raise NotImplementedError
 
     def print_summary(
-        self, test_results: typing.List[TestResult], whl_error: typing.List[str]
+        self, test_results: list[TestResult], whl_error: list[str]
     ) -> None:
         """Post process test results and print test summary.
@@ -333,17 +341,17 @@ def get_api_md5(path):
     API_spec = os.path.abspath(os.path.join(os.getcwd(), "..", path))
     if not os.path.isfile(API_spec):
         return api_md5
-    pat = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})')
-    patArgSpec = re.compile(
-        r'^(paddle[^,]+)\s+\(ArgSpec.*document\W*([0-9a-z]{32})'
-    )
+
     with open(API_spec) as f:
         for line in f.readlines():
-            mo = pat.search(line)
-            if not mo:
-                mo = patArgSpec.search(line)
+            mo = PAT_API_SPEC_MEMBER.search(line)
+            if mo:
                 api_md5[mo.group(1)] = mo.group(2)
+            else:
+                mo = PAT_API_SPEC_SIGNATURE.search(line)
+                if mo:
+                    api_md5[mo.group(1)] = f'{mo.group(2)}, {mo.group(3)}'
+
     return api_md5
 
 
@@ -397,18 +405,6 @@ def get_full_api_from_pr_spec():
         get_full_api_by_walk()
 
 
-def get_full_api():
-    """
-    get all the apis
-    """
-    global API_DIFF_SPEC_FN  # readonly
-    from print_signatures import get_all_api_from_modulelist
-
-    member_dict = get_all_api_from_modulelist()
-    with open(API_DIFF_SPEC_FN, 'w') as f:
-        f.write("\n".join(member_dict.keys()))
-
-
 def extract_code_blocks_from_docstr(docstr, google_style=True):
     """
     extract code-blocks from the given docstring.
@@ -599,9 +595,16 @@ def get_test_capacity(run_on_device="cpu"):
     return sample_code_test_capacity
 
 
-def get_docstring(full_test=False):
+def get_docstring(
+    full_test: bool = False,
+    filter_api: typing.Callable[[str], bool] | None = None,
+):
     '''
     this function will get the docstring for test.
+
+    Args:
+        full_test, get all the apis
+        filter_api, a function that filters apis; if it returns `True`, the api is skipped (not added to `docstrings_to_test`).
''' import paddle import paddle.static.quantization # noqa: F401 @@ -616,6 +619,9 @@ def get_docstring(full_test=False): with open(API_DIFF_SPEC_FN) as f: for line in f.readlines(): api = line.replace('\n', '') + if filter_api is not None and filter_api(api.strip()): + continue + try: api_obj = eval(api) except AttributeError: @@ -637,7 +643,7 @@ def get_docstring(full_test=False): return docstrings_to_test, whl_error -def check_old_style(docstrings_to_test: typing.Dict[str, str]): +def check_old_style(docstrings_to_test: dict[str, str]): old_style_apis = [] for api_name, raw_docstring in docstrings_to_test.items(): for codeblock in extract_code_blocks_from_docstr( @@ -715,8 +721,8 @@ def exec_gen_doc(): def get_test_results( - doctester: DocTester, docstrings_to_test: typing.Dict[str, str] -) -> typing.List[TestResult]: + doctester: DocTester, docstrings_to_test: dict[str, str] +) -> list[TestResult]: """Get test results from doctester with docstrings to test.""" _test_style = ( doctester.style diff --git a/tools/statistics_UT_resource.sh b/tools/statistics_UT_resource.sh index a6f1f264c4cd2..f97fc6f0dc51d 100644 --- a/tools/statistics_UT_resource.sh +++ b/tools/statistics_UT_resource.sh @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/test_print_signatures.py b/tools/test_print_signatures.py index 8a3bc60dcf9a7..20345d77b2566 100644 --- a/tools/test_print_signatures.py +++ b/tools/test_print_signatures.py @@ -25,7 +25,7 @@ import hashlib import unittest -from print_signatures import is_primitive, md5 +from print_signatures import md5 def func_example(param_a, param_b): @@ -62,26 +62,5 @@ def test_md5(self): self.assertEqual(digest, md5(func_example.__doc__)) -class Test_is_primitive(unittest.TestCase): - def test_single(self): - self.assertTrue(is_primitive(2)) - self.assertTrue(is_primitive(2.1)) - self.assertTrue(is_primitive("2.1.1")) - self.assertFalse(is_primitive(b"hello paddle")) - self.assertFalse(is_primitive(1j)) - self.assertTrue(is_primitive(True)) - - def test_collection(self): - self.assertTrue(is_primitive([])) - self.assertTrue(is_primitive(())) - self.assertTrue(is_primitive(set())) - self.assertTrue(is_primitive([1, 2])) - self.assertTrue(is_primitive((1.1, 2.2))) - self.assertTrue(is_primitive({1, 2.3})) - self.assertFalse(is_primitive(range(3))) - self.assertFalse(is_primitive({})) - self.assertFalse(is_primitive([1, 1j])) - - if __name__ == '__main__': unittest.main() diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py index 62c51a73ba8a7..c61c7e610f98c 100644 --- a/tools/test_sampcd_processor.py +++ b/tools/test_sampcd_processor.py @@ -103,19 +103,23 @@ def tearDown(self): def test_get_api_md5(self): res = get_api_md5('paddle/fluid/API_PR.spec') self.assertEqual( - "ff0f188c95030158cc6398d2a6c55one", res['paddle.one_plus_one'] + "ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6c55one", + res['paddle.one_plus_one'], ) self.assertEqual( - "ff0f188c95030158cc6398d2a6c55two", res['paddle.two_plus_two'] + 
"ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6c55two", + res['paddle.two_plus_two'], ) self.assertEqual( - "ff0f188c95030158cc6398d2a6cthree", res['paddle.three_plus_three'] + "ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ff0f188c95030158cc6398d2a6cthree", + res['paddle.three_plus_three'], ) self.assertEqual( "ff0f188c95030158cc6398d2a6c5four", res['paddle.four_plus_four'] ) self.assertEqual( - "ff0f188c95030158cc6398d2a6c5five", res['paddle.five_plus_five'] + "ArgSpec(), ff0f188c95030158cc6398d2a6c5five", + res['paddle.five_plus_five'], ) @@ -302,8 +306,8 @@ def test_global_exec(self): >>> import paddle >>> a = paddle.to_tensor(.2) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.20000000) """, 'set_default': """ placeholder @@ -319,8 +323,8 @@ def test_global_exec(self): >>> paddle.set_default_dtype('float64') >>> a = paddle.to_tensor(.2) >>> print(a) - Tensor(shape=[1], dtype=float64, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True, + 0.20000000) """, 'after_set_default': """ placeholder @@ -335,8 +339,8 @@ def test_global_exec(self): >>> import paddle >>> a = paddle.to_tensor(.2) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.20000000) """, } @@ -509,10 +513,10 @@ def test_patch_xdoctest(self): >>> import paddle >>> paddle.device.set_device('gpu') >>> a = paddle.to_tensor(.2) - >>> # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [0.20000000]) + >>> # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, 0.20000000) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, + 0.20000000) """, 'cpu_to_cpu': """ @@ -528,10 +532,10 @@ def test_patch_xdoctest(self): >>> import paddle >>> paddle.device.set_device('cpu') >>> a = paddle.to_tensor(.2) - >>> # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [0.20000000]) + >>> # Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, 0.20000000) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.20000000) """, 'gpu_to_cpu': """ @@ -547,10 +551,10 @@ def test_patch_xdoctest(self): >>> import paddle >>> paddle.device.set_device('gpu') >>> a = paddle.to_tensor(.2) - >>> # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [0.20000000]) + >>> # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, 0.20000000) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.20000000) """, 'cpu_to_gpu': """ @@ -566,10 +570,10 @@ def test_patch_xdoctest(self): >>> import paddle >>> paddle.device.set_device('cpu') >>> a = paddle.to_tensor(.2) - >>> # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [0.20000000]) + >>> # Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, 0.20000000) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), 
stop_gradient=True, - [0.20000000]) + Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, + 0.20000000) """, 'gpu_to_cpu_array': """ placeholder @@ -701,8 +705,8 @@ def test_patch_xdoctest(self): >>> paddle.device.set_device('gpu') >>> a = paddle.to_tensor(.123456789) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.123456780]) + Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, + 0.123456780) """, 'cpu_to_cpu': """ @@ -719,8 +723,8 @@ def test_patch_xdoctest(self): >>> paddle.device.set_device('cpu') >>> a = paddle.to_tensor(.123456789) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.123456780]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.123456780) """, 'gpu_to_cpu': """ @@ -737,8 +741,8 @@ def test_patch_xdoctest(self): >>> paddle.device.set_device('gpu') >>> a = paddle.to_tensor(.123456789) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, - [0.123456780]) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 0.123456780) """, 'cpu_to_gpu': """ @@ -755,8 +759,8 @@ def test_patch_xdoctest(self): >>> paddle.device.set_device('cpu') >>> a = paddle.to_tensor(.123456789) >>> print(a) - Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - [0.123456780]) + Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, + 0.123456780) """, 'gpu_to_cpu_array': """ placeholder @@ -2046,7 +2050,7 @@ def test_timeout(self): def test_bad_statements(self): docstrings_to_test = { - 'bad_fluid': """ + 'good_fluid': """ this is docstring... Examples: @@ -2191,9 +2195,9 @@ def test_bad_statements(self): tr_10, ) = test_results - self.assertIn('bad_fluid', tr_0.name) - self.assertTrue(tr_0.badstatement) - self.assertFalse(tr_0.passed) + self.assertIn('good_fluid', tr_0.name) + self.assertFalse(tr_0.badstatement) + self.assertTrue(tr_0.passed) self.assertIn('bad_fluid_from', tr_1.name) self.assertTrue(tr_1.badstatement) diff --git a/tools/test_type_checking.py b/tools/test_type_checking.py new file mode 100644 index 0000000000000..714be765ca9b5 --- /dev/null +++ b/tools/test_type_checking.py @@ -0,0 +1,630 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from tools.type_checking import MypyChecker, get_test_results + + +class TestMypyChecker(unittest.TestCase): + def test_mypy_pass(self): + docstrings_pass = { + 'simple': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import abc + >>> print(1) + 1 + """, + 'multi': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... 
+ + >>> # doctest: -REQUIRES(env:GPU) + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> print(1-1) + 0 + """, + } + docstrings_from_sampcd = { + 'gpu_to_gpu': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> a = paddle.to_tensor(.123456789) + >>> print(a) + Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [0.123456780]) + + """, + 'cpu_to_cpu': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> a = paddle.to_tensor(.123456789) + >>> print(a) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.123456780]) + + """, + 'gpu_to_cpu': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> a = paddle.to_tensor(.123456789) + >>> print(a) + Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.123456780]) + + """, + 'cpu_to_gpu': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> a = paddle.to_tensor(.123456789) + >>> print(a) + Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [0.123456780]) + """, + 'gpu_to_cpu_array': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> a = paddle.to_tensor([[1.123456789 ,2,3], [2,3,4], [3,4,5]]) + >>> print(a) + Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[1.123456780, 2., 3.], + [2., 3., 4.], + [3., 4., 5.]]) + """, + 'cpu_to_gpu_array': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> a = paddle.to_tensor([[1.123456789,2,3], [2,3,4], [3,4,5]]) + >>> print(a) + Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[1.123456780, 2., 3.], + [2., 3., 4.], + [3., 4., 5.]]) + """, + 'mass_array': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> a = paddle.to_tensor( + ... [[1.123456780, 2., -3, .3], + ... [2, 3, +4., 1.2+10.34e-5j], + ... [3, 5.e-3, 1e2, 3e-8]] + ... ) + >>> # Tensor(shape=[3, 4], dtype=complex64, place=Place(gpu:0), stop_gradient=True, + >>> # [[ (1.1234568357467651+0j) , + >>> # (2+0j) , + >>> # (-3+0j) , + >>> # (0.30000001192092896+0j) ], + >>> # [ (2+0j) , + >>> # (3+0j) , + >>> # (4+0j) , + >>> # (1.2000000476837158+0.00010340000153519213j)], + >>> # [ (3+0j) , + >>> # (0.004999999888241291+0j) , + >>> # (100+0j) , + >>> # (2.999999892949745e-08+0j) ]]) + >>> print(a) + Tensor(shape=[3, 4], dtype=complex64, place=Place(AAA), stop_gradient=True, + [[ (1.123456+0j), + (2+0j), + (-3+0j), + (0.3+0j)], + [ (2+0j), + (3+0j), + (4+0j), + (1.2+0.00010340j)], + [ (3+0j), + (0.00499999+0j), + (100+0j), + (2.999999e-08+0j)]]) + """, + 'float_array': """ + placeholder + + Examples: + + .. 
code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> x = [[2, 3, 4], [7, 8, 9]] + >>> x = paddle.to_tensor(x, dtype='float32') + >>> print(paddle.log(x)) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0.69314718, 1.09861231, 1.38629436], + [1.94591010, 2.07944155, 2.19722462]]) + + """, + 'float_array_diff': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import paddle + >>> paddle.device.set_device('cpu') + >>> x = [[2, 3, 4], [7, 8, 9]] + >>> x = paddle.to_tensor(x, dtype='float32') + >>> print(paddle.log(x)) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0.69314712, 1.09861221, 1.386294], + [1.94591032, 2.07944156, 2.1972246]]) + + """, + 'float_begin': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0) + 7. + + """, + 'float_begin_long': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0000023) + 7.0000024 + + """, + 'float_begin_more': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0, 5., 6.123456) + 7.0 5.0 6.123457 + + """, + 'float_begin_more_diff': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0, 5., 6.123456) + 7.0 5.0 6.123457 + + """, + 'float_begin_more_brief': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0, 5., 6.123456) + 7. 5. 6.123457 + + """, + 'float_begin_fail': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> print(7.0100023) + 7.0000024 + + """, + } + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_pass) + self.assertEqual(len(test_results), 3) + + for tr in test_results: + self.assertFalse(tr.fail) + + test_results = get_test_results(doctester, docstrings_from_sampcd) + self.assertEqual(len(test_results), 15) + + for tr in test_results: + print(tr.msg) + self.assertFalse(tr.fail) + + def test_mypy_fail(self): + docstrings_fail = { + 'fail_simple': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import blabla + """, + 'multi': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> blabla + >>> print(1-1) + 0 + """, + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_fail) + self.assertEqual(len(test_results), 3) + + for tr in test_results: + self.assertTrue(tr.fail) + + def test_mypy_partial_fail(self): + docstrings_fail = { + 'multi': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. 
code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> print(1-1) + 0 + """ + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_fail) + self.assertEqual(len(test_results), 2) + + tr_0, tr_1 = test_results + self.assertTrue(tr_0.fail) + self.assertFalse(tr_1.fail) + + def test_mypy_ignore(self): + docstrings_ignore = { + 'fail_simple': """ + placeholder + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> # type: ignore + >>> import blabla + """, + 'multi': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> # type: ignore + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> # type: ignore + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> blabla + >>> print(1-1) + 0 + """, + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_ignore) + self.assertEqual(len(test_results), 3) + + for tr in test_results: + print(tr.msg) + self.assertFalse(tr.fail) + + docstrings_pass = { + 'pass': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> a = 1 + >>> # type: ignore + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> b = 2 + >>> # type: ignore + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> blabla + >>> print(1-1) + 0 + """, + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_pass) + self.assertEqual(len(test_results), 2) + + for tr in test_results: + print(tr.msg) + self.assertFalse(tr.fail) + + docstrings_fail = { + 'fail': """ + placeholder + + .. code-block:: python + :name: code-example-0 + + this is some blabla... + + >>> # doctest: +SKIP('skip') + >>> print(1+1) + 2 + + Examples: + + .. code-block:: python + :name: code-example-1 + + this is some blabla... + + >>> import blabla + >>> a = 1 + >>> # type: ignore + >>> # doctest: -REQUIRES(env:GPU) + >>> blabla + >>> print(1-1) + 0 + + .. code-block:: python + :name: code-example-2 + + this is some blabla... + + >>> import blabla + >>> # type: ignore + >>> # doctest: +REQUIRES(env:GPU, env:XPU, env: DISTRIBUTED) + >>> blabla + >>> print(1-1) + 0 + """, + } + + doctester = MypyChecker('../pyproject.toml') + + test_results = get_test_results(doctester, docstrings_fail) + self.assertEqual(len(test_results), 2) + + for tr in test_results: + print(tr.msg) + self.assertTrue(tr.fail) diff --git a/tools/timeline.py b/tools/timeline.py index ff8d0946378d7..5e16e0b9bf4f3 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -148,7 +148,7 @@ def _allocate_pids(self): self._devices[(k, event.device_id, "CPU")] = pid # -1 device id represents CUDA API(RunTime) call.(e.g. 
cudaLaunch, cudaMemcpy) if event.device_id == -1: - self._chrome_trace.emit_pid("%s:cuda_api" % k, pid) + self._chrome_trace.emit_pid(f"{k}:cuda_api", pid) else: self._chrome_trace.emit_pid( "%s:cpu:block:%d" % (k, event.device_id), pid diff --git a/tools/timeout_debug_help.sh b/tools/timeout_debug_help.sh index 45de2db87e853..fcc6d473e49eb 100644 --- a/tools/timeout_debug_help.sh +++ b/tools/timeout_debug_help.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,7 +17,7 @@ set +e failed_uts=$1 need_debug_ut_re='test_dist_fleet' cat_log_judge=$(echo "${failed_uts}" | grep 'Timeout' | grep -oEi "$need_debug_ut_re" ) -if [[ "$cat_log_judge" != "" ]];then +if [[ "$cat_log_judge" != "" ]];then echo "==============================================" echo "show timeout ut logs" echo "==============================================" diff --git a/tools/type_checking.py b/tools/type_checking.py new file mode 100644 index 0000000000000..78285cb87eaa4 --- /dev/null +++ b/tools/type_checking.py @@ -0,0 +1,276 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# We type-check the `Example` codes from docstring. 
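MypyChecker.run below reduces to three steps: truncate the codeblock at any `>>> # type: ignore` marker, strip the doctest directives, then hand the recovered source lines to mypy's programmatic API. A minimal self-contained sketch of that pipeline, assuming only that mypy is installed (the sample snippet is invented):

    import doctest
    import re

    from mypy import api as mypy_api

    sample = '''
    >>> x: int = 1  # doctest: +SKIP('illustration only')
    >>> x + 1
    2
    '''

    # strip doctest/xdoctest directives, or `get_examples` would keep them in the source
    cleaned = re.sub(r'#\s*x?doctest\s*:.*', '', sample)
    # recover the bare source lines, with `>>>` and `...` prompts removed
    examples = doctest.DocTestParser().get_examples(cleaned)
    source = '\n'.join(line for e in examples for line in e.source.splitlines())
    normal_report, error_report, exit_status = mypy_api.run(['-c', source])
    print(exit_status)  # 0 when the snippet type-checks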
+
+from __future__ import annotations
+
+import argparse
+import doctest
+import pathlib
+import re
+from abc import abstractmethod
+from concurrent.futures import ProcessPoolExecutor
+from dataclasses import dataclass, field
+from typing import Any
+
+from mypy import api as mypy_api
+from sampcd_processor_utils import (
+    extract_code_blocks_from_docstr,
+    get_docstring,
+    init_logger,
+    log_exit,
+    logger,
+)
+
+
+class TypeChecker:
+    style: str = 'google'
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        pass
+
+    @abstractmethod
+    def run(self, api_name: str, codeblock: str) -> TestResult:
+        pass
+
+    @abstractmethod
+    def print_summary(
+        self, test_results: list[TestResult], whl_error: list[str]
+    ) -> None:
+        pass
+
+
+@dataclass
+class TestResult:
+    api_name: str
+    msg: str
+    fail: bool = False
+    extra_info: dict[str, Any] = field(default_factory=dict)
+
+
+class MypyChecker(TypeChecker):
+    def __init__(
+        self, config_file: str, cache_dir: str, *args: Any, **kwargs: Any
+    ) -> None:
+        self.config_file = config_file
+        self.cache_dir = cache_dir
+        super().__init__(*args, **kwargs)
+
+    def run(self, api_name: str, codeblock: str) -> TestResult:
+        # stop collecting lines at the first `>>> # type: ignore` marker; the rest of the codeblock is skipped
+        codeblock_for_checking = []
+        for line in codeblock.splitlines():
+            if line.strip().startswith('>>> # type: ignore'):
+                break
+            codeblock_for_checking.append(line)
+        codeblock_for_checking = '\n'.join(codeblock_for_checking)
+
+        # remove `doctest` directives from the codeblock, otherwise the module `doctest` cannot `get_examples` correctly
+        codeblock_for_checking = re.sub(
+            r'#\s*x?doctest\s*:.*', '', codeblock_for_checking
+        )
+
+        # `get_examples` codes with `>>>` and `...` stripped
+        _example_code = doctest.DocTestParser().get_examples(
+            codeblock_for_checking
+        )
+        example_code = '\n'.join(
+            [l for e in _example_code for l in e.source.splitlines()]
+        )
+
+        normal_report, error_report, exit_status = mypy_api.run(
+            [
+                f'--config-file={self.config_file}',
+                f'--cache-dir={self.cache_dir}',
+                '-c',
+                example_code,
+            ]
+        )
+
+        logger.debug('-' * 20)
+        logger.debug(f'>>> Type hints with api {api_name} start ...')
+        logger.debug(example_code)
+        logger.debug('>>> Results ...')
+        logger.debug('>>> mypy normal_report is ...')
+        logger.debug(normal_report)
+        logger.debug('>>> mypy error_report is ...')
+        logger.debug(error_report)
+        logger.debug('>>> mypy exit_status is ...')
+        logger.debug(exit_status)
+        logger.debug(f'>>> Type hints with api {api_name} end...')
+
+        return TestResult(
+            api_name=api_name,
+            msg='\n'.join([normal_report, error_report]),
+            fail=exit_status != 0,
+            extra_info={
+                'normal_report': normal_report,
+                'error_report': error_report,
+                'exit_status': exit_status,
+            },
+        )
+
+    def print_summary(
+        self, test_results: list[TestResult], whl_error: list[str]
+    ) -> None:
+        is_fail = False
+
+        logger.warning("----------------Check results--------------------")
+
+        if whl_error is not None and whl_error:
+            logger.warning("%s is not in whl.", whl_error)
+            logger.warning("")
+            logger.warning("Please check the whl package and API_PR.spec!")
+            logger.warning(
+                "You can follow these steps in order to generate API.spec:"
+            )
+            logger.warning("1. cd ${paddle_path}, compile paddle;")
+            logger.warning(
+                "2. pip install build/python/dist/(build whl package);"
+            )
+            logger.warning(
+                "3. run 'python tools/print_signatures.py paddle > paddle/fluid/API.spec'."
+            )
+            for test_result in test_results:
+                if test_result.fail:
+                    logger.error(
+                        ">>> In addition, mistakes found in type checking: %s",
+                        test_result.api_name,
+                    )
+                    logger.error(test_result.msg)
+            log_exit(1)
+
+        else:
+            for test_result in test_results:
+                if test_result.fail:
+                    is_fail = True
+
+                    logger.error(test_result.api_name)
+                    logger.error(test_result.msg)
+
+                else:
+                    logger.debug(test_result.api_name)
+                    logger.debug(test_result.msg)
+
+            if is_fail:
+                logger.error(">>> Mistakes found in type checking!")
+                logger.error(">>> Please recheck the type annotations.")
+                log_exit(1)
+
+        logger.warning(">>> Type checking is successful!")
+        logger.warning("----------------End of the Check--------------------")
+
+
+def parse_args() -> argparse.Namespace:
+    """
+    Parse input arguments
+    """
+    parser = argparse.ArgumentParser(
+        description='run Sample Code Type Checking'
+    )
+    parser.add_argument('--debug', dest='debug', action="store_true")
+    parser.add_argument(
+        '--logf', dest='logf', type=str, default=None, help='file for logging'
+    )
+    parser.add_argument(
+        '--config-file',
+        dest='config_file',
+        type=str,
+        default=None,
+        help='config file for type checker',
+    )
+    parser.add_argument(
+        '--cache-dir',
+        dest='cache_dir',
+        type=str,
+        default=None,
+        help='cache dir for mypy',
+    )
+    parser.add_argument('--full-test', dest='full_test', action="store_true")
+
+    args = parser.parse_args()
+    return args
+
+
+def get_test_results(
+    type_checker: TypeChecker, docstrings_to_test: dict[str, str]
+) -> list[TestResult]:
+    _test_style = (
+        type_checker.style
+        if type_checker.style in {'google', 'freeform'}
+        else 'google'
+    )
+    google_style = _test_style == 'google'
+
+    api_names = []
+    codeblocks = []
+    for api_name, raw_docstring in docstrings_to_test.items():
+        # we may extract more than one codeblock from a docstring.
+ for codeblock in extract_code_blocks_from_docstr( + raw_docstring, google_style=google_style + ): + codeblock_name = codeblock['name'] + codeblock_id = codeblock['id'] + + api_names.append(f'{api_name}:{codeblock_name or codeblock_id}') + codeblocks.append(codeblock['codes']) + + test_results = [] + with ProcessPoolExecutor() as exe: + test_results = exe.map( + type_checker.run, api_names, codeblocks, timeout=600 + ) + + return list(test_results) + + +def run_type_checker( + args: argparse.Namespace, type_checker: TypeChecker +) -> None: + # init logger + init_logger(debug=args.debug, log_file=args.logf) + + logger.info( + "----------------Codeblock Type Checking Start--------------------" + ) + + logger.info(">>> Get docstring from api ...") + filter_api = lambda api_name: 'libpaddle' in api_name + docstrings_to_test, whl_error = get_docstring( + full_test=args.full_test, filter_api=filter_api + ) + + logger.info(">>> Running type checker ...") + test_results = get_test_results(type_checker, docstrings_to_test) + + logger.info(">>> Print summary ...") + type_checker.print_summary(test_results, whl_error) + + +if __name__ == '__main__': + base_path = pathlib.Path(__file__).resolve().parent.parent + + args = parse_args() + mypy_checker = MypyChecker( + config_file=( + args.config_file + if args.config_file + else (base_path / 'pyproject.toml') + ), + cache_dir=( + args.cache_dir if args.cache_dir else (base_path / '.mypy_cache') + ), + ) + run_type_checker(args, mypy_checker) diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat index 884cea8ca4cd0..016e2a4ff25cb 100644 --- a/tools/windows/build_compile_environment.bat +++ b/tools/windows/build_compile_environment.bat @@ -16,7 +16,7 @@ :: Build Paddle compile environment :: =============================== :: Description: -:: +:: :: Install compile environment for xly CI. :: :: Include: @@ -55,7 +55,7 @@ if %errorlevel% == 0 ( ) else ( echo Error***** Download wget tool failed, please download it before rerun. exit /b 1 -) +) goto :eof :: ===== end step 0: wget tool ===== @@ -296,7 +296,7 @@ goto tensorrt echo There is not sccache in this PC, will install sccache. echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe wget -O sccache.exe "https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe" -copy sccache.exe C:\Python38 /Y +copy sccache.exe C:\Python38 /Y goto :eof :: ===== end step 7: sccache on windows ===== diff --git a/tools/windows/check_change_of_unittest.sh b/tools/windows/check_change_of_unittest.sh index 576f0e5d238ab..25073435e3fb2 100644 --- a/tools/windows/check_change_of_unittest.sh +++ b/tools/windows/check_change_of_unittest.sh @@ -19,7 +19,7 @@ GITHUB_API_TOKEN=$GITHUB_API_TOKEN GIT_PR_ID=$AGILE_PULL_ID BRANCH=$BRANCH if [ "${GITHUB_API_TOKEN}" == "" ] || [ "${GIT_PR_ID}" == "" ];then - exit 0 + exit 0 fi unittest_spec_diff=$(cat $PADDLE_ROOT/deleted_ut | sed 's/^/ - /g') diff --git a/tools/xpu/get_xpti_dependence.sh b/tools/xpu/get_xpti_dependence.sh index 95cc4a110ed6d..6801990933d76 100644 --- a/tools/xpu/get_xpti_dependence.sh +++ b/tools/xpu/get_xpti_dependence.sh @@ -1,13 +1,13 @@ #!/bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
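Since each MypyChecker.run call launches a full mypy pass, get_test_results above fans the extracted codeblocks out across processes via exe.map(type_checker.run, api_names, codeblocks, timeout=600). A toy model of that fan-out, with an invented stand-in for the checker:

    from concurrent.futures import ProcessPoolExecutor

    def check(api_name: str, codeblock: str) -> str:
        # stand-in for MypyChecker.run, which returns a TestResult instead
        fail = 'import blabla' in codeblock
        return f"{api_name}: {'fail' if fail else 'ok'}"

    if __name__ == '__main__':
        api_names = ['my.mod.demo:code-example-1', 'my.mod.demo2:code-example-1']
        codeblocks = ['x: int = 1\n', 'import blabla\n']
        with ProcessPoolExecutor() as exe:
            results = list(exe.map(check, api_names, codeblocks, timeout=600))
        print(results)  # ['my.mod.demo:code-example-1: ok', 'my.mod.demo2:code-example-1: fail']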