[CINN][Backend Pass Update No.4] Update TransBufferWithDynamicShape pass #70548

Merged — 2 commits, Jan 2, 2025
6 changes: 5 additions & 1 deletion paddle/cinn/optim/optimize.cc
@@ -77,7 +77,11 @@ ir::LoweredFunc Optimize(ir::LoweredFunc fn,
         RemoveGpuForLoops(copied);
       }
       CudaSyncThreadsDropIfThenElse(copied);
-      // CudaTransBufferWithDynamicShape(&copied);
+      FuncPassManager func_pass_manager;
+      VLOG(10) << "Before Optimize TransBufferWithDynamicShape:" << copied;
+      func_pass_manager.AddPass(CreateTransBufferWithDynamicShapePass());
+      func_pass_manager.Run(copied);
+      VLOG(10) << "After Optimize TransBufferWithDynamicShape:" << copied;
 #endif
     },
     [&](common::HygonDCUArchHIP) {
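This hunk replaces the direct call to the old `CudaTransBufferWithDynamicShape` helper with the pass-manager flow introduced by this series. A minimal sketch of the new invocation pattern — only `FuncPassManager`, `CreateTransBufferWithDynamicShapePass`, and `ir::LoweredFunc` are taken from this diff; the wrapper function around them is illustrative:

```cpp
// Sketch of the new pass-manager flow, assuming `fn` is an ir::LoweredFunc
// produced by the earlier lowering stages (not shown here).
void ApplyTransBufferPass(ir::LoweredFunc fn) {
  FuncPassManager func_pass_manager;
  func_pass_manager.AddPass(CreateTransBufferWithDynamicShapePass());
  func_pass_manager.Run(fn);  // runs every registered FuncPass over fn
}
```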
79 changes: 67 additions & 12 deletions paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc
@@ -35,24 +35,29 @@ namespace {
 common::cas_intervals_t var_intervals = {};
 cinn::common::SymbolicExprAnalyzer analyzer(var_intervals);

-struct Mutator : public ir::IRMutator<> {
+struct Mutator : public ir::IRMutator<>, public ir::stmt::StmtMutator<> {
   using ir::IRMutator<>::Visit;

   Mutator() : shared_mem_size_used_(0) {}

+  void operator()(ir::stmt::BlockRef block) { VisitBlock(block); }
+
+  size_t shared_mem_size_used() const { return shared_mem_size_used_; }
+
+ private:
   void Visit(const ir::_Tensor_* tensor, Expr* expr) override {
     if (!tensor->buffer.defined()) return;
     auto buf = tensor->buffer.As<ir::_Buffer_>();
     if (!visited_buf_.count(buf->name)) {
       visited_buf_.insert(buf->name);
       auto buf_size = ir::Expr(1);

-      size_t max_size = std::max(buf->shape.size(), tensor->shape.size());
-      size_t min_size = std::min(buf->shape.size(), tensor->shape.size());
+      size_t max_dim = std::max(buf->shape.size(), tensor->shape.size());
+      size_t min_dim = std::min(buf->shape.size(), tensor->shape.size());
       size_t i = 0;
-      for (; i < min_size; ++i) {
-        auto e = expr->as_tensor()->shape[i];
-        auto buf_e = buf->shape[i];
+      for (; i < min_dim; ++i) {
+        Expr e = expr->as_tensor()->shape[i];
+        Expr buf_e = buf->shape[i];
         if (buf->memory_type == ir::MemoryType::GPULocal) {
           e = cinn::common::AutoSimplify(e);
           buf_e = cinn::common::AutoSimplify(buf_e);
@@ -77,7 +82,7 @@ struct Mutator : public ir::IRMutator<> {
       }
       buf_size = buf_size * buf_e;
     }
-    for (; i < max_size; i++) {
+    for (; i < max_dim; i++) {
       auto e = buf->shape.size() > tensor->shape.size() ? buf->shape[i]
                                                         : tensor->shape[i];
       if (buf->memory_type == ir::MemoryType::GPULocal) {
@@ -110,15 +115,60 @@ struct Mutator : public ir::IRMutator<> {
     }
   }

+  void VisitStmt(ir::stmt::Let stmt) override {
+    Expr body = stmt->body();
+    Visit(&body, &body);
+  }
+
+  void VisitStmt(ir::stmt::Store stmt) override {
+    Expr tensor = stmt->tensor();
+    Visit(&tensor, &tensor);
+  }
+
+  void VisitStmt(ir::stmt::For stmt) override {
+    Expr min = stmt->min();
+    Expr extent = stmt->extent();
+    Visit(&min, &min);
+    Visit(&extent, &extent);
+    VisitBlock(stmt->body());
+  }
+
+  void VisitStmt(ir::stmt::IfThenElse stmt) override {
+    Expr condition = stmt->condition();
+    Visit(&condition, &condition);
+    VisitBlock(stmt->true_case());
+    if (stmt->false_case().defined()) {
+      VisitBlock(stmt->false_case());
+    }
+  }
+
+  void VisitStmt(ir::stmt::Schedule stmt) override {
+    for (Expr read_buffer : stmt->read_buffers()) {
+      Visit(&read_buffer, &read_buffer);
+    }
+    for (Expr write_buffer : stmt->write_buffers()) {
+      Visit(&write_buffer, &write_buffer);
+    }
+    VisitBlock(stmt->body());
+  }
+
+  void VisitStmt(ir::stmt::Evaluate stmt) override {
+    Expr value = stmt->value();
+    Visit(&value, &value);
+  }
+
+  void VisitStmt(ir::stmt::Alloc stmt) override { return; }
+
+  void VisitStmt(ir::stmt::Free stmt) override { return; }
+
   size_t shared_mem_size_used_;
   std::unordered_set<std::string> visited_buf_;
 };

 }  // namespace

-void CudaTransBufferWithDynamicShape(ir::Expr* e) {
+LogicalResult TransBufferWithDynamicShapePass::Run(ir::LoweredFunc func) {
   Mutator mutator;
-  mutator.Visit(e, e);
+  mutator(func->body_block);
   cinn::common::DefaultDeviceTarget().arch.Match(
       [&](std::variant<common::UnknownArch, common::X86Arch, common::ARMArch>) {
       },
@@ -129,7 +179,7 @@ void CudaTransBufferWithDynamicShape(ir::Expr* e) {
         if (cur_dev_info->IsValid()) {
           size_t max_shm_per_block = cur_dev_info->GetMaxSharedMemPerBlock();
           PADDLE_ENFORCE_EQ(
-              (mutator.shared_mem_size_used_ <= max_shm_per_block),
+              (mutator.shared_mem_size_used() <= max_shm_per_block),
               true,
               ::common::errors::InvalidArgument(
                   "The shared memory size used by current kernel is greater "
@@ -144,12 +194,17 @@ void CudaTransBufferWithDynamicShape(ir::Expr* e) {
                   ->get_device_property(
                       BackendAPI::DeviceProperty::MaxSharedMemoryPerBlock);
           PADDLE_ENFORCE_LE(
-              mutator.shared_mem_size_used_,
+              mutator.shared_mem_size_used(),
               max_shm_per_block,
               ::common::errors::InvalidArgument(
                   "The shared memory size used by current kernel is greater "
                   "than the max shared memory per block"));
       },
       [&](common::HygonDCUArchSYCL) { CINN_NOT_IMPLEMENTED });
+  return LogicalResult::success();
 }
+
+std::unique_ptr<FuncPass> CreateTransBufferWithDynamicShapePass() {
+  return std::make_unique<TransBufferWithDynamicShapePass>();
+}
 }  // namespace cinn::optim
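The substance of this file's change is that `Mutator` now also derives from `ir::stmt::StmtMutator<>`: the shared-memory analysis walks the statement IR (each `VisitStmt` overload visits the expressions the statement owns, then recurses into nested blocks) instead of mutating a raw `Expr` tree. A self-contained toy sketch of that two-level dispatch shape — stand-in types for illustration, not CINN's real classes:

```cpp
#include <memory>
#include <vector>

// Toy statement IR, only to show the visit-then-recurse pattern used above.
struct Expr { int v = 0; };
struct Block;
struct Stmt { virtual ~Stmt() = default; };
struct Evaluate : Stmt { Expr value; };
struct For : Stmt { Expr min, extent; std::unique_ptr<Block> body; };
struct Block { std::vector<std::unique_ptr<Stmt>> stmts; };

struct ToyStmtMutator {
  void VisitBlock(Block& block) {
    for (auto& s : block.stmts) {
      if (auto* f = dynamic_cast<For*>(s.get())) {
        VisitStmt(*f);
      } else if (auto* e = dynamic_cast<Evaluate*>(s.get())) {
        VisitStmt(*e);
      }
    }
  }
  // Each overload first visits the expressions the statement owns,
  // then recurses into any nested block, mirroring the diff above.
  virtual void VisitStmt(For& f) {
    VisitExpr(f.min);
    VisitExpr(f.extent);
    VisitBlock(*f.body);
  }
  virtual void VisitStmt(Evaluate& e) { VisitExpr(e.value); }
  virtual void VisitExpr(Expr&) {}  // expression-level hook (IRMutator's role)
};
```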
29 changes: 26 additions & 3 deletions paddle/cinn/optim/trans_buffer_with_dynamic_shape.h
@@ -16,15 +16,38 @@
 #include <string>

 #include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/pass/pass.h"

 namespace cinn {
 namespace optim {

+class TransBufferWithDynamicShapePass : public FuncPass {
+ public:
+  TransBufferWithDynamicShapePass()
+      : FuncPass("trans_buffer_with_dynamic_shape") {}
+  LogicalResult Run(ir::LoweredFunc func) override;
+};
+
 /**
- * Given Expr AST, translate dynamic shape in buffers to
- * static shape, the pass is just used on Nvidia GPU temporarily.
+ * Transforms buffers' dynamic shapes to constant shapes and performs shared
+ * memory usage checks.
+ *
+ * This pass is applicable in scenarios where tensor buffers have dynamic
+ * shapes, especially in GPU computations. It's crucial for ensuring correct
+ * memory allocation and preventing buffer overflows in shared memory usage on
+ * GPUs.
+ *
+ * When applied, this pass will analyze tensor buffers and their shapes,
+ * calculating the required memory size. For GPU local memory, it will attempt
+ * to determine upper bounds for dynamic shapes. For GPU shared memory, it will
+ * calculate the total shared memory usage and verify it against hardware
+ * limits.
+ *
+ * Risks and limitations:
+ * - Currently only checks shared memory usage against hardware limits for
+ *   NVIDIA GPUs and Hygon DCU.
  */
-void CudaTransBufferWithDynamicShape(ir::Expr* expr);
+std::unique_ptr<FuncPass> CreateTransBufferWithDynamicShapePass();
Contributor Author's comment on this declaration (translated from Chinese): the "Cuda" prefix was dropped from the function name because the shared-memory limit check now also has a HygonDCUArchHIP implementation.

 }  // namespace optim
 }  // namespace cinn
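As the header shows, a backend pass in this scheme has three parts: a `FuncPass` subclass whose constructor hands the pass name to the base, an overridden `Run(ir::LoweredFunc)` returning `LogicalResult`, and a `Create*` factory returning `std::unique_ptr<FuncPass>`. A hedged sketch of a hypothetical pass following the same template — `MyNewPass` and its body are invented for illustration; only `FuncPass`, `LogicalResult`, and `ir::LoweredFunc` come from this PR:

```cpp
// Hypothetical example following the header's template, not part of the PR.
class MyNewPass : public FuncPass {
 public:
  MyNewPass() : FuncPass("my_new_pass") {}
  LogicalResult Run(ir::LoweredFunc func) override {
    // Rewrite func->body_block here, e.g. with a StmtMutator subclass.
    return LogicalResult::success();
  }
};

std::unique_ptr<FuncPass> CreateMyNewPass() {
  return std::make_unique<MyNewPass>();
}
```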
5 changes: 0 additions & 5 deletions paddle/cinn/pass/pass_adaptor.h
@@ -50,11 +50,6 @@ class PassAdaptor {

 class FuncPassAdaptor : public PassAdaptor<FuncPass> {
  private:
-  LogicalResult RunPipeline(
-      ir::stmt::BlockRef block,
-      const std::vector<std::unique_ptr<FuncPass>>& passes) {
-    return LogicalResult::failure();
-  }
   LogicalResult Run(
       ir::LoweredFunc func,
       const std::vector<std::unique_ptr<FuncPass>>& passes) override;