Commit 09ac2c1
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into nested-namespace-part-6

walkalone20 committed Jun 4, 2024
2 parents 7deb15f + 4cb694e commit 09ac2c1
Showing 751 changed files with 12,989 additions and 8,102 deletions.
8 changes: 4 additions & 4 deletions .clang-format
@@ -6,11 +6,11 @@
# The basic usage is,
# clang-format -i -style=file PATH/TO/SOURCE/CODE
#
-# The -style=file implicit use ".clang-format" file located in one of
-# parent directory.
+# The -style=file implicit use ".clang-format" file located in one of
+# parent directory.
# The -i means inplace change.
#
-# The document of clang-format is
+# The document of clang-format is
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
@@ -20,7 +20,7 @@ IndentWidth: 2
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -1 # The private/protected/public has no indent in class
-Standard: Cpp11
+Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
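To make the key options concrete, here is a sketch of how clang-format renders a declaration under the settings above (IndentWidth: 2, AccessModifierOffset: -1, BinPackParameters: false); the class is invented for illustration:

class Foo {
 public:                      // access specifier offset -1 from the 2-space indent
  void Bar(int first_param,   // BinPackParameters: false gives each
           int second_param,  // parameter its own line
           int third_param);
};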
1 change: 0 additions & 1 deletion .github/CODEOWNERS
@@ -53,7 +53,6 @@ python/paddle/base/compiler.py @XiaoguangHu01 @zhiqiu @Xreki @qili93 @Aurelius84
python/paddle/base/dygraph/layers.py @JiabinYang @phlrain
python/paddle/base/framework.py @XiaoguangHu01 @zhiqiu @Xreki @qili93 @Aurelius84
python/paddle/base/__init__.py @phlrain @Aurelius84 @qili93
-python/paddle/base/parallel_executor.py @Xreki @zhhsplendid @Aurelius84
python/paddle/base/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py @Aurelius84 @phlrain
python/paddle/base/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py @Aurelius84 @phlrain
python/paddle/base/tests/unittests/white_list/check_shape_white_list.py @hong19860320 @Aurelius84 @phlrain
7 changes: 5 additions & 2 deletions CMakeLists.txt
@@ -99,6 +99,9 @@ if(WITH_GPU AND WITH_ROCM)
endif()

if(WITH_GPU AND NOT APPLE)
+  if(WITH_PIP_CUDA_LIBRARIES AND CMAKE_SYSTEM_NAME STREQUAL "Windows")
+    add_definitions(-DPADDLE_WITH_PIP_CUDA_LIBRARIES)
+  endif()
#(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS
if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL
"x86_64")
@@ -107,8 +110,8 @@ if(WITH_GPU AND NOT APPLE)
CACHE BOOL "" FORCE)
set(CMAKE_CUDA_FLAGS "--cudart shared")
if(WITH_PIP_CUDA_LIBRARIES)
-      #(Note risemeup1): Flag 'WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA.
-      add_definitions(-DWITH_PIP_CUDA_LIBRARIES)
+      #(Note risemeup1): Flag 'PADDLE_WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA.
+      add_definitions(-DPADDLE_WITH_PIP_CUDA_LIBRARIES)
endif()
endif()
enable_language(CUDA)
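Per the note in the hunk, the PADDLE_WITH_PIP_CUDA_LIBRARIES definition is consumed by dynamic_loader.cc when locating CUDA shared libraries installed through NVIDIA's pip packages. A minimal sketch of how such a guard can be used — the function name, wheel layout, and fallback path are illustrative assumptions, not Paddle's actual implementation:

#include <string>
#include <vector>

// Hypothetical: assemble the directories searched for libcudart.so and
// friends, preferring pip-installed wheels when the macro is defined.
std::vector<std::string> CudaLibrarySearchDirs() {
  std::vector<std::string> dirs;
#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
  // Assumed layout of NVIDIA pip wheels (e.g. nvidia-cuda-runtime-cu12).
  dirs.push_back("site-packages/nvidia/cuda_runtime/lib");
#endif
  dirs.push_back("/usr/local/cuda/lib64");  // conventional system install
  return dirs;
}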
2 changes: 1 addition & 1 deletion cmake/PaddleConfig.cmake.in
@@ -12,7 +12,7 @@
get_filename_component(PADDLE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_FILE}/../.." ABSOLUTE)

# include directories
-set(PADDLE_INCLUDE_DIRS
+set(PADDLE_INCLUDE_DIRS
${PADDLE_INSTALL_PREFIX}/include
${PADDLE_INSTALL_PREFIX}/include/third_party
)
2 changes: 1 addition & 1 deletion cmake/make_resource.py
@@ -24,7 +24,7 @@
"const unsigned char "
+ var
+ "[] = {"
+ ",".join(["0x%02x" % ord(c) for c in open(res).read()])
+ ",".join([f"0x{ord(c):02x}" for c in open(res).read()])
+ ",0};\n"
+ "const unsigned "
+ var
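The hunk above only modernizes the string formatting ("0x%02x" % ord(c) becomes an f-string); the generated file is unchanged. As a sketch, for a resource file containing the two bytes 'h' and 'i' and a variable name kernel_res, the script would emit roughly the following C — the _size suffix on the second constant is an assumption, since that declaration is truncated in this view:

const unsigned char kernel_res[] = {0x68,0x69,0};
const unsigned kernel_res_size = 2; /* assumed name; emitted by the truncated lines */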
5 changes: 5 additions & 0 deletions paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -131,6 +131,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
} else {
iter_values.push_back(axis_vars[i]);
}
+    ir::TryElevateInt32ToInt64({ir::Expr(axis_vars[i]), shape[i]});
}
VLOG(4) << "iter_value.size() and block_vars.size() is "
<< iter_values.size() << " " << block_vars.size();
@@ -167,6 +168,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
} else {
reduce_iter_values.push_back(axis_vars[i]);
}
+    ir::TryElevateInt32ToInt64({ir::Expr(axis_vars[i]), shape[i]});
}
VLOG(4) << "ast gen: reduce body is after replace 0" << reduce_body;
for (int i = 0; i < reduce_axis.size(); ++i) {
@@ -227,6 +229,9 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
ir::ScheduleBlock::Make(
reduce_block_vars, {}, {}, tensor->name, reduce_body));
for (int i = static_cast<int>(reduce_axis.size()) - 1; i >= 0; --i) {
+    ir::TryElevateInt32ToInt64({reduce_axis[i],
+                                reduce_axis[i]->lower_bound,
+                                reduce_axis[i]->upper_bound});
reduce_body = ir::For::Make(reduce_axis[i],
reduce_axis[i]->lower_bound,
reduce_axis[i]->upper_bound,
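Each call above groups an axis variable with its extent (or a reduce axis with its bounds) so they end up sharing one integer type. Conceptually, the elevation promotes the whole group to int64 as soon as any member is int64, preventing mixed 32/64-bit loop bounds and indices. A simplified stand-alone sketch of that rule — the real ir::Expr type system is much richer than this:

#include <vector>

enum class DType { Int32, Int64 };
struct Expr { DType type = DType::Int32; };

// Promote every expression in the group to int64 if at least one member
// already is; otherwise leave the whole group in int32.
void TryElevateInt32ToInt64(const std::vector<Expr*>& group) {
  bool has_int64 = false;
  for (const Expr* e : group)
    if (e->type == DType::Int64) has_int64 = true;
  if (!has_int64) return;
  for (Expr* e : group) e->type = DType::Int64;
}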
18 changes: 13 additions & 5 deletions paddle/cinn/auto_schedule/analysis/analyze_ir.cc
@@ -32,7 +32,7 @@
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/optim/optimize.h"
#include "paddle/cinn/optim/transform_gpu_forloop.h"

#include "paddle/common/enforce.h"
namespace cinn {
namespace auto_schedule {

@@ -193,10 +193,14 @@ ir::LoweredFunc UpdateFuncWithNewBody(const cinn::common::Target& target,
std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
const ir::ScheduleBlockRealize* block_realize =
block.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlockRealize"));
const ir::ScheduleBlock* block_node =
block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(block_node);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_node,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlock"));
std::vector<ir::Expr> iter_values = block_realize->iter_values;
std::vector<ir::Var> iter_vars = block_node->iter_vars;

@@ -218,10 +222,14 @@ std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
std::string GetBlockName(const ir::Expr block) {
const ir::ScheduleBlockRealize* block_realize =
block.As<ir::ScheduleBlockRealize>();
-  CHECK_NOTNULL(block_realize);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_realize,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlockRealize"));
const ir::ScheduleBlock* block_node =
block_realize->schedule_block.As<ir::ScheduleBlock>();
-  CHECK_NOTNULL(block_node);
+  PADDLE_ENFORCE_NOT_NULL(
+      block_node,
+      phi::errors::InvalidArgument("The block is not a ScheduleBlock"));
return block_node->name;
}

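This file shows the pattern repeated through the rest of the commit: glog-style CHECK_* assertions become PADDLE_ENFORCE_* calls carrying a typed phi::errors::* payload. The practical difference is that CHECK aborts the process outright, while PADDLE_ENFORCE raises a structured, catchable error with a formatted message. A self-contained mock of those semantics, simplified for illustration and not Paddle's actual macro:

#include <iostream>
#include <sstream>
#include <stdexcept>

// Stand-in for PADDLE_ENFORCE_GT: throw a descriptive exception on failure
// instead of aborting the way CHECK_GT does.
#define MOCK_ENFORCE_GT(a, b, msg)                          \
  do {                                                      \
    if (!((a) > (b))) {                                     \
      std::ostringstream os;                                \
      os << "InvalidArgument: " << (msg) << " (got " << (a) \
         << " <= " << (b) << ")";                           \
      throw std::invalid_argument(os.str());                \
    }                                                       \
  } while (0)

int main() {
  try {
    int num_tuning_rounds = 0;
    MOCK_ENFORCE_GT(num_tuning_rounds, 0,
                    "The num_tuning_rounds should be greater than 0.");
  } catch (const std::invalid_argument& e) {
    std::cerr << e.what() << '\n';  // the caller can recover here
  }
  return 0;
}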
9 changes: 5 additions & 4 deletions paddle/cinn/auto_schedule/auto_tuner.cc
@@ -34,7 +34,7 @@
#include "paddle/cinn/hlir/framework/op.h"
#include "paddle/cinn/hlir/framework/visualize_helper.h"
#include "paddle/cinn/utils/string.h"

#include "paddle/common/enforce.h"
namespace cinn {
namespace auto_schedule {

@@ -144,9 +144,10 @@ void PrintResult(const TuningResult& result) {
}

TuningResult AutoTuner::Tune(const TuningOptions& options) {
-  CHECK_GT(options.num_tuning_rounds, 0) << "Invalid config";
VLOG(3) << "Begin tuning with round num=" << options.num_tuning_rounds
<< ", tasks size=" << tasks_.size();
+  PADDLE_ENFORCE_GT(options.num_tuning_rounds,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The num_tuning_rounds should be greater than 0."));

TuningResult result;
result.subgraphs.resize(tasks_.size());
14 changes: 9 additions & 5 deletions paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc
@@ -24,7 +24,7 @@
#include "paddle/cinn/auto_schedule/search_space/search_state.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"

#include "paddle/common/enforce.h"
namespace cinn {
namespace auto_schedule {

@@ -45,8 +45,10 @@ void ExprCostModel::Train(const std::vector<const ir::ModuleExpr*>& samples,
const cinn::common::Target& target) {
trained_times_.store(1);
size_t total_size = samples.size();
-  CHECK_EQ(total_size, labels.size())
-      << "Samples must have same size as labels";
+  PADDLE_ENFORCE_EQ(
+      total_size,
+      labels.size(),
+      phi::errors::InvalidArgument("Samples must have same size as labels"));
std::vector<std::vector<float>> train_feature_numbers(total_size);
FeatureExtractor extractor;
for (size_t i = 0; i < total_size; ++i) {
Expand All @@ -63,8 +65,10 @@ void ExprCostModel::Update(const std::vector<const ir::ModuleExpr*>& samples,
const cinn::common::Target& target) {
++trained_times_;
size_t total_size = samples.size();
-  CHECK_EQ(total_size, labels.size())
-      << "Samples must have same size as labels";
+  PADDLE_ENFORCE_EQ(
+      total_size,
+      labels.size(),
+      phi::errors::InvalidArgument("Samples must have same size as labels"));
std::vector<std::vector<float>> train_feature_numbers(total_size);
FeatureExtractor extractor;
for (size_t i = 0; i < total_size; ++i) {
8 changes: 5 additions & 3 deletions paddle/cinn/auto_schedule/database/database.cc
@@ -22,7 +22,7 @@
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/schedule_desc.h"

#include "paddle/common/enforce.h"
namespace cinn {
namespace auto_schedule {

@@ -42,8 +42,10 @@ proto::TuningRecord TuningRecord::ToProto() const {

Database::Database(int capacity_per_task)
: capacity_per_task_(capacity_per_task) {
-  CHECK_GT(capacity_per_task_, 0)
-      << "capacity_per_task_ should be greater than 0";
+  PADDLE_ENFORCE_GT(capacity_per_task_,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "capacity_per_task_ should be greater than 0"));
}

std::unique_ptr<Database> Database::Make(const DatabaseConfig& config) {
8 changes: 5 additions & 3 deletions paddle/cinn/auto_schedule/measure/simple_builder.cc
@@ -13,7 +13,7 @@
// limitations under the License.

#include "paddle/cinn/auto_schedule/measure/simple_builder.h"

#include "paddle/common/enforce.h"
namespace cinn {
namespace auto_schedule {

@@ -25,8 +25,10 @@ SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler)
: graph_compiler_(graph_compiler) {}

BuildResult SimpleBuilder::Build(const MeasureInput& input) {
-  CHECK_NE(graph_compiler_, static_cast<GraphCompiler*>(nullptr))
-      << "empty handle to GraphCompiler";
+  PADDLE_ENFORCE_NE(
+      graph_compiler_,
+      static_cast<GraphCompiler*>(nullptr),
+      phi::errors::InvalidArgument("empty handle to GraphCompiler"));
CompilationContext& context = graph_compiler_->GetCompilationContext();
context.groups.emplace_back(input.task->subgraph);
context.lowered_funcs.emplace_back(input.lowered_funcs);
21 changes: 16 additions & 5 deletions paddle/cinn/auto_schedule/measure/simple_runner.cc
@@ -25,7 +25,7 @@
#include "paddle/cinn/hlir/framework/buffer.h"
#include "paddle/cinn/hlir/framework/scope.h"
#include "paddle/cinn/hlir/framework/tensor.h"

#include "paddle/common/enforce.h"
namespace cinn {
namespace auto_schedule {

@@ -76,8 +76,11 @@ static void PopulateRandomValue(const cinn::common::Type& type,
std::generate_n(
fmt_ptr, numel, [&engine, &dist]() { return dist(engine); });
} else {
-    CHECK_EQ(type.bytes(), 8)
-        << "Unsupported type: " << type << ", type.bytes = " << type.bytes();
+    PADDLE_ENFORCE_EQ(
+        type.bytes(),
+        8,
+        phi::errors::Unimplemented("Unsupported type, the type.bytes is %d",
+                                   type.bytes()));
auto* fmt_ptr = reinterpret_cast<uint8_t*>(raw_ptr);
std::uniform_int_distribution<uint8_t> dist(
std::numeric_limits<uint8_t>::min(),
@@ -127,7 +130,12 @@ static std::unordered_set<std::string> ParamsNeedInitWithZero(
std::vector<int> param_idxs = kInitWithZeroParams.at(node->op()->name);
const auto& inlinks = node->inlinks_in_order();
for (int param_idx : param_idxs) {
-      CHECK_GT(inlinks.size(), param_idx);
+      PADDLE_ENFORCE_GT(inlinks.size(),
+                        param_idx,
+                        phi::errors::InvalidArgument(
+                            "The input size of the node is less than the "
+                            "index of the parameter that needs to be "
+                            "initialized to 0"));
auto& edge = inlinks.at(param_idx);
std::string param_name =
edge->source()->as<hlir::framework::NodeData>()->id();
Expand All @@ -141,7 +149,10 @@ static std::unordered_set<std::string> ParamsNeedInitWithZero(
}

SimpleRunner::SimpleRunner(int repeat_times) : repeat_times_(repeat_times) {
-  CHECK_GT(repeat_times_, 0) << "repeat_times can't less than 0";
+  PADDLE_ENFORCE_GT(
+      repeat_times_,
+      0,
+      phi::errors::InvalidArgument("repeat_times should be greater than 0"));
}

// Prepare execution arguments of all instructions to run, a argument
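The PopulateRandomValue fallback above fills 8-byte elements through a uint8_t pointer, one random byte at a time. A stand-alone sketch of that byte-wise fill under assumed buffer handling; the sketch draws int and casts, since the standard does not permit uint8_t as a uniform_int_distribution parameter:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <random>

// Fill a raw buffer with uniformly distributed random bytes.
void FillRandomBytes(void* raw_ptr, std::size_t num_bytes) {
  std::mt19937 engine{std::random_device{}()};
  std::uniform_int_distribution<int> dist(0, 255);
  auto* bytes = reinterpret_cast<std::uint8_t*>(raw_ptr);
  std::generate_n(bytes, num_bytes,
                  [&] { return static_cast<std::uint8_t>(dist(engine)); });
}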
Changes to an additional file (the file-name header was not captured):
@@ -18,7 +18,7 @@
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/schedule_desc.h"

#include "paddle/common/enforce.h"
namespace cinn {
namespace auto_schedule {

@@ -29,7 +29,10 @@ int ExtractNumThreads(const ir::IRSchedule& ir_schedule,
if (step.type == "Bind" &&
step.attrs.find("thread_axis") != step.attrs.end() &&
absl::get<std::string>(step.attrs.at("thread_axis")) == bind_axis) {
CHECK_EQ(step.inputs.at("loop").size(), 1);
PADDLE_ENFORCE_EQ(step.inputs.at("loop").size(),
1,
phi::errors::InvalidArgument(
"The loop size of bind step should be 1"));
return step.inputs.at("loop")[0].As<ir::For>()->extent.as_int32();
}
}
40 changes: 29 additions & 11 deletions paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
@@ -21,7 +21,7 @@
#include "paddle/cinn/ir/schedule_block_graph.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"

#include "paddle/common/enforce.h"
namespace cinn {
namespace auto_schedule {

@@ -40,8 +40,11 @@ bool IsSpatialLoop(const ir::For* for_node) {
const auto* schedule_block =
block_realize->schedule_block.As<ir::ScheduleBlock>();
CHECK(schedule_block) << "schedule_block field is not a ScheduleBlock";
-  CHECK_EQ(block_realize->iter_values.size(),
-           schedule_block->iter_vars.size());
+  PADDLE_ENFORCE_EQ(
+      block_realize->iter_values.size(),
+      schedule_block->iter_vars.size(),
+      phi::errors::InvalidArgument(
+          "The size of iter_values and iter_vars should be equal."));
for (int i = 0; i < block_realize->iter_values.size(); ++i) {
const ir::Var& iter_var = schedule_block->iter_vars[i];
const ir::Expr& binding = block_realize->iter_values[i];
@@ -93,10 +96,16 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
int max_blocks,
int max_threads_per_block) {
auto all_loops = ir_schedule->GetLoops(block_name);
-  CHECK_LE(num_loops_to_bind, all_loops.size())
-      << "The number of loops to be bind is greater than size of all_loops";
-  CHECK_GE(num_loops_to_bind, 0)
-      << "The number of loops to be bind should be greater than 0";
+  PADDLE_ENFORCE_LE(
+      num_loops_to_bind,
+      all_loops.size(),
+      phi::errors::InvalidArgument(
+          "The number of loops to be bind is greater than size of all_loops"));
+  PADDLE_ENFORCE_GE(
+      num_loops_to_bind,
+      0,
+      phi::errors::InvalidArgument(
+          "The number of loops to be bind should be greater than 0"));
// check whether it is the case that threadIdx has been binded but blockIdx
// not, the threadIdx can only be binded in the first loop after
// num_loops_to_bind loops because we has excluded other cases in
@@ -130,13 +139,19 @@

if (extent <= max_blocks * max_threads_per_block) {
auto splits = ir_schedule->Split(fused_loop, {-1, max_threads_per_block});
-    CHECK_EQ(splits.size(), 2);
+    PADDLE_ENFORCE_EQ(
+        splits.size(),
+        2,
+        phi::errors::InvalidArgument("The size of splits should be 2."));
ir_schedule->Bind(splits[0], "blockIdx.x");
ir_schedule->Bind(splits[1], "threadIdx.x");
} else {
auto splits =
ir_schedule->Split(fused_loop, {-1, max_blocks, max_threads_per_block});
-    CHECK_EQ(splits.size(), 3);
+    PADDLE_ENFORCE_EQ(
+        splits.size(),
+        3,
+        phi::errors::InvalidArgument("The size of splits should be 3."));
ir_schedule->Reorder({splits[1], splits[2], splits[0]});
all_loops = ir_schedule->GetLoops(block_name);
ir_schedule->Bind(all_loops[0], "blockIdx.x");
Expand All @@ -160,8 +175,11 @@ RuleApplyType AutoBind::Init(ir::IRSchedule* ir_schedule) {
}

void AutoBind::Apply(int index) {
-  CHECK_LT(index, applicable_schedule_blocks_.size())
-      << "invalid apply index:" << index;
+  PADDLE_ENFORCE_LT(
+      index,
+      applicable_schedule_blocks_.size(),
+      phi::errors::InvalidArgument(
+          "The index should be less than size of applicable_schedule_blocks_"));
auto applied_block = applicable_schedule_blocks_.at(index);
auto all_loops = ir_schedule_->GetLoops(applied_block);
BindGPUIndex(ir_schedule_,
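To make the split arithmetic in BindGPUIndex concrete with assumed numbers: for a fused loop of extent 5000 and max_threads_per_block = 1024, the extent fits inside max_blocks * max_threads_per_block, so Split(fused_loop, {-1, 1024}) infers an outer extent of ceil(5000 / 1024) = 5 for blockIdx.x and leaves 1024 for threadIdx.x. A two-line check of that arithmetic:

#include <cstdio>

int main() {
  const int extent = 5000, max_threads_per_block = 1024;  // assumed values
  const int blocks =
      (extent + max_threads_per_block - 1) / max_threads_per_block;  // ceil division
  std::printf("blockIdx.x extent = %d, threadIdx.x extent = %d\n",
              blocks, max_threads_per_block);
  return 0;
}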