From 5548c55abaaf9048331820842ebb0e8af93d3bd9 Mon Sep 17 00:00:00 2001 From: xumingkuan Date: Thu, 20 Aug 2020 23:07:18 +0800 Subject: [PATCH] [Opt] Identical store/load elimination by control-flow graph (#1741) --- taichi/ir/control_flow_graph.cpp | 122 +++++++++++++-- taichi/ir/control_flow_graph.h | 5 + taichi/transforms/simplify.cpp | 247 ------------------------------- 3 files changed, 118 insertions(+), 256 deletions(-) diff --git a/taichi/ir/control_flow_graph.cpp b/taichi/ir/control_flow_graph.cpp index 73ceedfe547ed..a9f8dfe494e1c 100644 --- a/taichi/ir/control_flow_graph.cpp +++ b/taichi/ir/control_flow_graph.cpp @@ -179,8 +179,14 @@ Stmt *CFGNode::get_store_forwarding_data(Stmt *var, int position) const { return result; } Stmt *result = nullptr; + bool result_visible = false; auto visible = [&](Stmt *stmt) { - // Do we need to check if `stmt` is before `position` here? + // Check if `stmt` is before `position` here. + if (stmt->parent == block) { + return stmt->parent->locate(stmt) < position; + } + // TODO: What if `stmt` appears in an ancestor of `block` but after + // `position`? return parent_blocks.find(stmt->parent) != parent_blocks.end(); }; auto update_result = [&](Stmt *stmt) { @@ -190,7 +196,10 @@ Stmt *CFGNode::get_store_forwarding_data(Stmt *var, int position) const { } if (!result) { result = data; - } else if (!irpass::analysis::same_statements(result, data)) { + result_visible = visible(data); + return true; // continue the following loops + } + if (!irpass::analysis::same_statements(result, data)) { // check the special case of alloca (initialized to 0) if (!(result->is() && data->is() && data->width() == 1 && @@ -198,8 +207,11 @@ Stmt *CFGNode::get_store_forwarding_data(Stmt *var, int position) const { return false; // return nullptr } } - if (visible(data)) + if (!result_visible && visible(data)) { + // pick the visible one for store-to-load forwarding result = data; + result_visible = true; + } return true; // continue the following loops }; for (auto stmt : reach_in) { @@ -223,7 +235,7 @@ Stmt *CFGNode::get_store_forwarding_data(Stmt *var, int position) const { block->statements[position]->id); return nullptr; } - if (!visible(result)) { + if (!result_visible) { return nullptr; } return result; @@ -264,6 +276,46 @@ bool CFGNode::store_to_load_forwarding(bool after_lower_access) { i--; // to cancel i++ in the for loop modified = true; } + continue; + } + // Identical store elimination + if (auto local_store = stmt->cast()) { + result = get_store_forwarding_data(local_store->ptr, i); + if (result) { + if (result->is()) { + // special case of alloca (initialized to 0) + if (auto stored_data = local_store->data->cast()) { + bool all_zero = true; + for (auto &val : stored_data->val.data) { + if (!val.equal_value(0)) { + all_zero = false; + break; + } + } + if (all_zero) { + erase(i); // This causes end_location-- + i--; // to cancel i++ in the for loop + modified = true; + } + } + } else { + // not alloca + if (irpass::analysis::same_statements(result, local_store->data)) { + erase(i); // This causes end_location-- + i--; // to cancel i++ in the for loop + modified = true; + } + } + } + } else if (auto global_store = stmt->cast()) { + if (!after_lower_access) { + result = get_store_forwarding_data(global_store->ptr, i); + if (irpass::analysis::same_statements(result, global_store->data)) { + erase(i); // This causes end_location-- + i--; // to cancel i++ in the for loop + modified = true; + } + } } } return modified; @@ -342,6 +394,7 @@ bool CFGNode::dead_store_elimination(bool after_lower_access) { replace_with(i, std::move(local_load), true); // Notice that we have a load here. live_in_this_node.insert(atomic->dest); + killed_in_this_node.erase(atomic->dest); modified = true; continue; } else if (!is_parallel_executed) { @@ -354,6 +407,11 @@ bool CFGNode::dead_store_elimination(bool after_lower_access) { replace_with(i, std::move(global_load), true); // Notice that we have a load here. live_in_this_node.insert(atomic->dest); + // Note: It's possible that a global pointer is not erased from + // killed_in_this_node although it should be. This may harm the + // performance of identical load elimination but it's faster than + // checking the contents one by one. + killed_in_this_node.erase(atomic->dest); modified = true; continue; } @@ -375,11 +433,55 @@ bool CFGNode::dead_store_elimination(bool after_lower_access) { } } auto load_ptrs = irpass::analysis::get_load_pointers(stmt); + if (load_ptrs.size() == 1 && store_ptrs.empty() && stmt->width() == 1) { + // Identical load elimination + auto load_ptr = load_ptrs.front(); + if (!after_lower_access || + (load_ptr->is() || load_ptr->is())) { + // After lower_access, we only analyze local variables and stacks. + if (!may_contain_variable(killed_in_this_node, load_ptr) && + contain_variable(live_in_this_node, load_ptr)) { + // Only perform identical load elimination within a CFGNode. + for (int j = i + 1; j < end_location; j++) { + auto next_load_ptrs = + irpass::analysis::get_load_pointers(block->statements[j].get()); + bool found = false; + for (auto &next_load_ptr : next_load_ptrs) { + if (irpass::analysis::maybe_same_address(load_ptr, + next_load_ptr)) { + found = true; + break; + } + } + if (found) { + if (irpass::analysis::same_statements( + stmt, block->statements[j].get())) { + block->statements[j]->replace_with(stmt); + erase(j); + modified = true; + break; + } else { + TI_WARN("Identical load elimination failed."); + } + } + } + } + } + } for (auto &load_ptr : load_ptrs) { if (!after_lower_access || (load_ptr->is() || load_ptr->is())) { // After lower_access, we only analyze local variables and stacks. live_in_this_node.insert(load_ptr); + if (store_ptrs.empty()) { + // Only allow identical load elimination (i.e. allow this statement + // to be eliminated) if this statement doesn't store any data. + // Note: It's possible that a global pointer is not erased from + // killed_in_this_node although it should be. This may harm the + // performance of identical load elimination but it's faster than + // checking the contents one by one. + killed_in_this_node.erase(load_ptr); + } } } } @@ -484,12 +586,14 @@ void ControlFlowGraph::reaching_definition_analysis(bool after_lower_access) { if (!after_lower_access) { for (int i = 0; i < num_nodes; i++) { for (int j = nodes[i]->begin_location; j < nodes[i]->end_location; j++) { - if (auto global_load = - nodes[i]->block->statements[j]->cast()) { - nodes[start_node]->reach_gen.insert(global_load->ptr); + auto stmt = nodes[i]->block->statements[j].get(); + if (stmt->is() || stmt->is() || + stmt->is() || stmt->is() || + stmt->is()) { + // TODO: unify them + // A global pointer that may contain some data before this kernel. + nodes[start_node]->reach_gen.insert(stmt); } - // Since we only do store-to-load forwarding, we don't need to mark - // other global pointers' data source at the start node. } } } diff --git a/taichi/ir/control_flow_graph.h b/taichi/ir/control_flow_graph.h index f391b1a95f6f8..932c38e735215 100644 --- a/taichi/ir/control_flow_graph.h +++ b/taichi/ir/control_flow_graph.h @@ -88,9 +88,14 @@ class ControlFlowGraph { void live_variable_analysis(bool after_lower_access); void simplify_graph(); + // This pass cannot eliminate container statements properly for now. bool unreachable_code_elimination(); + + // Also performs identical store elimination. bool store_to_load_forwarding(bool after_lower_access); + + // Also performs identical load elimination. bool dead_store_elimination(bool after_lower_access); }; diff --git a/taichi/transforms/simplify.cpp b/taichi/transforms/simplify.cpp index 83856500d73a6..3e949752f1005 100644 --- a/taichi/transforms/simplify.cpp +++ b/taichi/transforms/simplify.cpp @@ -122,253 +122,6 @@ class BasicBlockSimplify : public IRVisitor { return false; } - void visit(LocalLoadStmt *stmt) override { - if (is_done(stmt)) - return; - - // TODO: Implement this in cfg_optimization - /* - // Merge identical loads - for (int i = 0; i < current_stmt_id; i++) { - auto &bstmt = block->statements[i]; - if (stmt->ret_type == bstmt->ret_type) { - auto &bstmt_data = *bstmt; - if (typeid(bstmt_data) == typeid(*stmt)) { - auto bstmt_ = bstmt->as(); - bool same = true; - std::vector vars; - for (int l = 0; l < stmt->width(); l++) { - vars.push_back(stmt->ptr[l].var); - if (stmt->ptr[l].var != bstmt_->ptr[l].var || - stmt->ptr[l].offset != bstmt_->ptr[l].offset) { - same = false; - break; - } - } - if (same) { - // no store to the var? - bool has_related_store = false; - auto advanced_optimization = - block->get_config().advanced_optimization; - for (int j = i + 1; j < current_stmt_id; j++) { - if (!advanced_optimization) { - if (block->statements[j] - ->is_container_statement()) { // no if, while, etc.. - has_related_store = true; - break; - } - if (modifies_local(block->statements[j].get(), vars)) { - has_related_store = true; - } - continue; - } - if (irpass::analysis::has_store_or_atomic( - block->statements[j].get(), vars)) { - has_related_store = true; - break; - } - } - if (!has_related_store) { - stmt->replace_with(bstmt.get()); - stmt->parent->erase(current_stmt_id); - throw IRModified(); - } - } - } - } - } - - // store-forwarding - bool regular = true; - auto alloca = stmt->ptr[0].var; - for (int l = 0; l < stmt->width(); l++) { - if (stmt->ptr[l].offset != l || stmt->ptr[l].var != alloca) { - regular = false; - } - } - if (regular) { - // Check all previous statements in the current block before the local - // load - Stmt *containing_statement = stmt; - auto stmt_id = block->locate(containing_statement); - TI_ASSERT(stmt_id != -1); - for (int i = stmt_id - 1; i >= 0; i--) { - if (!block->get_config().advanced_optimization) { - auto &bstmt = block->statements[i]; - // Find a previous store - if (auto s = bstmt->cast()) { - if (s->dest == alloca) { - break; - } - } - if (bstmt->is()) { - auto bstmt_ = bstmt->as(); - // Same alloca - if (bstmt_->ptr == alloca) { - // Forward to the first local store only - stmt->replace_with(bstmt_->data); - stmt->parent->erase(current_stmt_id); - throw IRModified(); - } - } else if (bstmt->is_container_statement()) { - // assume this container may modify the local var - break; - } - continue; - } - auto last_store = irpass::analysis::last_store_or_atomic( - block->statements[i].get(), alloca); - if (!last_store.first) { - // invalid - break; - } - auto bstmt = last_store.second; - if (bstmt != nullptr) { - if (bstmt->is()) { - // Forward to the first local store only - stmt->replace_with(bstmt->as()->data); - stmt->parent->erase(current_stmt_id); - throw IRModified(); - } else { - TI_ASSERT(bstmt->is()); - auto zero = stmt->insert_after_me(Stmt::make( - LaneAttribute(bstmt->ret_type.data_type))); - zero->repeat(stmt->width()); - stmt->replace_with(zero); - stmt->parent->erase(current_stmt_id); - throw IRModified(); - } - } - } - // Note: simply checking all statements before stmt is not sufficient - // since statements after stmt may change the value of the alloca - // For example, in a loop, later part of the loop body may alter the local - // var value. - } - */ - set_done(stmt); - } - - void visit(LocalStoreStmt *stmt) override { - if (is_done(stmt)) - return; - - // TODO: Implement this in cfg_optimization - /* - // has previous store? - for (int i = 0; i < current_stmt_id; i++) { - auto &bstmt = block->statements[i]; - if (stmt->ret_type == bstmt->ret_type) { - auto &bstmt_data = *bstmt; - if (typeid(bstmt_data) == typeid(*stmt)) { - auto bstmt_ = bstmt->as(); - bool same = stmt->ptr == bstmt_->ptr; - if (same) { - bool has_load = false; - for (int j = i + 1; j < current_stmt_id; j++) { - if (!block->get_config().advanced_optimization) { - if (block->statements[j] - ->is_container_statement()) { // no if, while, etc.. - has_load = true; - break; - } - if (block->statements[j]->is() && - block->statements[j]->as()->has_source( - stmt->ptr)) { - has_load = true; - } - if (block->statements[j]->is() && - (block->statements[j]->as()->dest == - stmt->ptr)) { - // $a = alloca - // $b : local store [$a <- v1] <-- prev lstore |bstmt_| - // $c = atomic add($a, v2) <-- cannot eliminate $b - // $d : local store [$a <- v3] - has_load = true; - } - continue; - } - if (!irpass::analysis::gather_statements( - block->statements[j].get(), - [&](Stmt *s) { - if (auto load = s->cast()) - return load->has_source(stmt->ptr); - else if (auto atomic = s->cast()) - return atomic->dest == stmt->ptr; - else - return s->is() || - s->is(); - }) - .empty()) { - has_load = true; - break; - } - } - if (!has_load) { - stmt->parent->erase(bstmt_); - throw IRModified(); - } - } - } - } - } - - // Does it have a following load? If not, delete. - if (stmt->parent->locate(stmt->ptr) != -1) { - // optimize variables local to this block only - bool has_related = false; - for (int i = current_stmt_id + 1; i < (int)block->statements.size(); - i++) { - if (!block->get_config().advanced_optimization) { - auto &bstmt = block->statements[i]; - if (bstmt->is_container_statement()) { - has_related = true; - break; - } - if (bstmt->is()) { - auto bstmt_ = bstmt->as(); - if (bstmt_->has_source(stmt->ptr)) { - has_related = true; - break; - } - } - if (bstmt->is()) { - // $a = alloca - // $b : local store [$a <- v1] - // $c = atomic add($a, v2) <-- cannot eliminate $b - auto bstmt_ = bstmt->as(); - if (bstmt_->dest == stmt->ptr) { - has_related = true; - break; - } - } - continue; - } - if (!irpass::analysis::gather_statements( - block->statements[i].get(), - [&](Stmt *s) { - if (auto load = s->cast()) - return load->has_source(stmt->ptr); - else if (auto atomic = s->cast()) - return atomic->dest == stmt->ptr; - else - return false; - }) - .empty()) { - has_related = true; - break; - } - } - if (!has_related) { - stmt->parent->erase(stmt); - throw IRModified(); - } - } - */ - - set_done(stmt); - } - void visit(GlobalLoadStmt *stmt) override { if (is_done(stmt)) return;