[CINN][Backend Pass Update No.4] Update TransBufferWithDynamicShape pass #70548

Merged — 2 commits, Jan 2, 2025
6 changes: 5 additions & 1 deletion paddle/cinn/optim/optimize.cc
@@ -77,7 +77,11 @@ ir::LoweredFunc Optimize(ir::LoweredFunc fn,
         RemoveGpuForLoops(copied);
       }
       CudaSyncThreadsDropIfThenElse(copied);
-      // CudaTransBufferWithDynamicShape(&copied);
+      FuncPassManager func_pass_manager;
+      VLOG(10) << "Before Optimize TransBufferWithDynamicShape:" << copied;
+      func_pass_manager.AddPass(CreateTransBufferWithDynamicShapePass());
+      func_pass_manager.Run(copied);
+      VLOG(10) << "After Optimize TransBufferWithDynamicShape:" << copied;
 #endif
     },
     [&](common::HygonDCUArchHIP) {
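This hunk replaces the direct call to the old `CudaTransBufferWithDynamicShape` helper with the pass-manager flow introduced by this series. A minimal sketch of the new invocation pattern — only `FuncPassManager`, `CreateTransBufferWithDynamicShapePass`, and `ir::LoweredFunc` are taken from this diff; the wrapper function around them is illustrative:

```cpp
// Sketch of the new pass-manager flow, assuming `fn` is an ir::LoweredFunc
// produced by the earlier lowering stages (not shown here).
void ApplyTransBufferPass(ir::LoweredFunc fn) {
  FuncPassManager func_pass_manager;
  func_pass_manager.AddPass(CreateTransBufferWithDynamicShapePass());
  func_pass_manager.Run(fn);  // runs every registered FuncPass over fn
}
```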
79 changes: 67 additions & 12 deletions paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc
@@ -35,24 +35,29 @@ namespace {
 common::cas_intervals_t var_intervals = {};
 cinn::common::SymbolicExprAnalyzer analyzer(var_intervals);

-struct Mutator : public ir::IRMutator<> {
+struct Mutator : public ir::IRMutator<>, public ir::stmt::StmtMutator<> {
   using ir::IRMutator<>::Visit;

   Mutator() : shared_mem_size_used_(0) {}

+  void operator()(ir::stmt::BlockRef block) { VisitBlock(block); }
+
+  size_t shared_mem_size_used() const { return shared_mem_size_used_; }
+
+ private:
   void Visit(const ir::_Tensor_* tensor, Expr* expr) override {
     if (!tensor->buffer.defined()) return;
     auto buf = tensor->buffer.As<ir::_Buffer_>();
     if (!visited_buf_.count(buf->name)) {
       visited_buf_.insert(buf->name);
       auto buf_size = ir::Expr(1);

-      size_t max_size = std::max(buf->shape.size(), tensor->shape.size());
-      size_t min_size = std::min(buf->shape.size(), tensor->shape.size());
+      size_t max_dim = std::max(buf->shape.size(), tensor->shape.size());
+      size_t min_dim = std::min(buf->shape.size(), tensor->shape.size());
       size_t i = 0;
-      for (; i < min_size; ++i) {
-        auto e = expr->as_tensor()->shape[i];
-        auto buf_e = buf->shape[i];
+      for (; i < min_dim; ++i) {
+        Expr e = expr->as_tensor()->shape[i];
+        Expr buf_e = buf->shape[i];
         if (buf->memory_type == ir::MemoryType::GPULocal) {
           e = cinn::common::AutoSimplify(e);
           buf_e = cinn::common::AutoSimplify(buf_e);
@@ -77,7 +82,7 @@ struct Mutator : public ir::IRMutator<> {
       }
       buf_size = buf_size * buf_e;
     }
-    for (; i < max_size; i++) {
+    for (; i < max_dim; i++) {
       auto e = buf->shape.size() > tensor->shape.size() ? buf->shape[i]
                                                         : tensor->shape[i];
       if (buf->memory_type == ir::MemoryType::GPULocal) {
@@ -110,15 +115,60 @@ struct Mutator : public ir::IRMutator<> {
     }
   }

+  void VisitStmt(ir::stmt::Let stmt) override {
+    Expr body = stmt->body();
+    Visit(&body, &body);
+  }
+
+  void VisitStmt(ir::stmt::Store stmt) override {
+    Expr tensor = stmt->tensor();
+    Visit(&tensor, &tensor);
+  }
+
+  void VisitStmt(ir::stmt::For stmt) override {
+    Expr min = stmt->min();
+    Expr extent = stmt->extent();
+    Visit(&min, &min);
+    Visit(&extent, &extent);
+    VisitBlock(stmt->body());
+  }
+
+  void VisitStmt(ir::stmt::IfThenElse stmt) override {
+    Expr condition = stmt->condition();
+    Visit(&condition, &condition);
+    VisitBlock(stmt->true_case());
+    if (stmt->false_case().defined()) {
+      VisitBlock(stmt->false_case());
+    }
+  }
+
+  void VisitStmt(ir::stmt::Schedule stmt) override {
+    for (Expr read_buffer : stmt->read_buffers()) {
+      Visit(&read_buffer, &read_buffer);
+    }
+    for (Expr write_buffer : stmt->write_buffers()) {
+      Visit(&write_buffer, &write_buffer);
+    }
+    VisitBlock(stmt->body());
+  }
+
+  void VisitStmt(ir::stmt::Evaluate stmt) override {
+    Expr value = stmt->value();
+    Visit(&value, &value);
+  }
+
+  void VisitStmt(ir::stmt::Alloc stmt) override { return; }
+
+  void VisitStmt(ir::stmt::Free stmt) override { return; }
+
   size_t shared_mem_size_used_;
   std::unordered_set<std::string> visited_buf_;
 };

 }  // namespace

-void CudaTransBufferWithDynamicShape(ir::Expr* e) {
+LogicalResult TransBufferWithDynamicShapePass::Run(ir::LoweredFunc func) {
   Mutator mutator;
-  mutator.Visit(e, e);
+  mutator(func->body_block);
   cinn::common::DefaultDeviceTarget().arch.Match(
       [&](std::variant<common::UnknownArch, common::X86Arch, common::ARMArch>) {
       },
@@ -129,7 +179,7 @@ void CudaTransBufferWithDynamicShape(ir::Expr* e) {
         if (cur_dev_info->IsValid()) {
           size_t max_shm_per_block = cur_dev_info->GetMaxSharedMemPerBlock();
           PADDLE_ENFORCE_EQ(
-              (mutator.shared_mem_size_used_ <= max_shm_per_block),
+              (mutator.shared_mem_size_used() <= max_shm_per_block),
               true,
               ::common::errors::InvalidArgument(
                   "The shared memory size used by current kernel is greater "
@@ -144,12 +194,17 @@ void CudaTransBufferWithDynamicShape(ir::Expr* e) {
                   ->get_device_property(
                       BackendAPI::DeviceProperty::MaxSharedMemoryPerBlock);
           PADDLE_ENFORCE_LE(
-              mutator.shared_mem_size_used_,
+              mutator.shared_mem_size_used(),
               max_shm_per_block,
               ::common::errors::InvalidArgument(
                   "The shared memory size used by current kernel is greater "
                   "than the max shared memory per block"));
       },
       [&](common::HygonDCUArchSYCL) { CINN_NOT_IMPLEMENTED });
+  return LogicalResult::success();
 }
+
+std::unique_ptr<FuncPass> CreateTransBufferWithDynamicShapePass() {
+  return std::make_unique<TransBufferWithDynamicShapePass>();
+}
 }  // namespace cinn::optim
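The substance of this file's change is that `Mutator` now also derives from `ir::stmt::StmtMutator<>`: the shared-memory analysis walks the statement IR (each `VisitStmt` overload visits the expressions the statement owns, then recurses into nested blocks) instead of mutating a raw `Expr` tree. A self-contained toy sketch of that two-level dispatch shape — stand-in types for illustration, not CINN's real classes:

```cpp
#include <memory>
#include <vector>

// Toy statement IR, only to show the visit-then-recurse pattern used above.
struct Expr { int v = 0; };
struct Block;
struct Stmt { virtual ~Stmt() = default; };
struct Evaluate : Stmt { Expr value; };
struct For : Stmt { Expr min, extent; std::unique_ptr<Block> body; };
struct Block { std::vector<std::unique_ptr<Stmt>> stmts; };

struct ToyStmtMutator {
  void VisitBlock(Block& block) {
    for (auto& s : block.stmts) {
      if (auto* f = dynamic_cast<For*>(s.get())) {
        VisitStmt(*f);
      } else if (auto* e = dynamic_cast<Evaluate*>(s.get())) {
        VisitStmt(*e);
      }
    }
  }
  // Each overload first visits the expressions the statement owns,
  // then recurses into any nested block, mirroring the diff above.
  virtual void VisitStmt(For& f) {
    VisitExpr(f.min);
    VisitExpr(f.extent);
    VisitBlock(*f.body);
  }
  virtual void VisitStmt(Evaluate& e) { VisitExpr(e.value); }
  virtual void VisitExpr(Expr&) {}  // expression-level hook (IRMutator's role)
};
```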
29 changes: 26 additions & 3 deletions paddle/cinn/optim/trans_buffer_with_dynamic_shape.h
@@ -16,15 +16,38 @@
 #include <string>

 #include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/pass/pass.h"

 namespace cinn {
 namespace optim {

+class TransBufferWithDynamicShapePass : public FuncPass {
+ public:
+  TransBufferWithDynamicShapePass()
+      : FuncPass("trans_buffer_with_dynamic_shape") {}
+  LogicalResult Run(ir::LoweredFunc func) override;
+};
+
 /**
- * Given Expr AST, translate dynamic shape in buffers to
- * static shape, the pass is just used on Nvidia GPU temporarily.
+ * Transforms buffers' dynamic shapes to constant shapes and performs shared
+ * memory usage checks.
+ *
+ * This pass is applicable in scenarios where tensor buffers have dynamic
+ * shapes, especially in GPU computations. It's crucial for ensuring correct
+ * memory allocation and preventing buffer overflows in shared memory usage on
+ * GPUs.
+ *
+ * When applied, this pass will analyze tensor buffers and their shapes,
+ * calculating the required memory size. For GPU local memory, it will attempt
+ * to determine upper bounds for dynamic shapes. For GPU shared memory, it will
+ * calculate the total shared memory usage and verify it against hardware
+ * limits.
+ *
+ * Risks and limitations:
+ * - Currently only checks shared memory usage against hardware limits for
+ *   NVIDIA GPUs and Hygon DCU.
  */
-void CudaTransBufferWithDynamicShape(ir::Expr* expr);
+std::unique_ptr<FuncPass> CreateTransBufferWithDynamicShapePass();
Contributor Author's comment on this declaration (translated from Chinese): the "Cuda" prefix was dropped from the function name because the shared-memory limit check now also has a HygonDCUArchHIP implementation.

 }  // namespace optim
 }  // namespace cinn
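As the header shows, a backend pass in this scheme has three parts: a `FuncPass` subclass whose constructor hands the pass name to the base, an overridden `Run(ir::LoweredFunc)` returning `LogicalResult`, and a `Create*` factory returning `std::unique_ptr<FuncPass>`. A hedged sketch of a hypothetical pass following the same template — `MyNewPass` and its body are invented for illustration; only `FuncPass`, `LogicalResult`, and `ir::LoweredFunc` come from this PR:

```cpp
// Hypothetical example following the header's template, not part of the PR.
class MyNewPass : public FuncPass {
 public:
  MyNewPass() : FuncPass("my_new_pass") {}
  LogicalResult Run(ir::LoweredFunc func) override {
    // Rewrite func->body_block here, e.g. with a StmtMutator subclass.
    return LogicalResult::success();
  }
};

std::unique_ptr<FuncPass> CreateMyNewPass() {
  return std::make_unique<MyNewPass>();
}
```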
5 changes: 0 additions & 5 deletions paddle/cinn/pass/pass_adaptor.h
@@ -50,11 +50,6 @@ class PassAdaptor {

 class FuncPassAdaptor : public PassAdaptor<FuncPass> {
  private:
-  LogicalResult RunPipeline(
-      ir::stmt::BlockRef block,
-      const std::vector<std::unique_ptr<FuncPass>>& passes) {
-    return LogicalResult::failure();
-  }
   LogicalResult Run(
       ir::LoweredFunc func,
       const std::vector<std::unique_ptr<FuncPass>>& passes) override;