[MetaSchedule] Introduce Async Pipeline in MultiLevelTiling
This PR introduces an async pipeline into TVM's current MultiLevelTiling rules. It is blocked on apache#13966, since some conv2d workloads use `tir.if_then_else` to pad the input to the correct size, and this PR applies async copy to such copy statements.
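
For readers unfamiliar with that pattern, the sketch below is a minimal plain-C++ analogue of such a padded copy (illustrative only, not TVM code; `load_padded` is a hypothetical helper). On sm_80+, a predicated load of this shape can be lowered to an asynchronous, zero-filling copy rather than a branch.

```cpp
#include <cstdio>

// Minimal analogue of the `tir.if_then_else` padding pattern: out-of-bounds
// reads return the pad value so the copied tile has the shape the kernel
// expects. `load_padded` is a hypothetical helper for illustration.
float load_padded(const float* src, int i, int n) {
  return (0 <= i && i < n) ? src[i] : 0.0f;
}

int main() {
  const float data[3] = {1.0f, 2.0f, 3.0f};
  for (int i = -1; i <= 3; ++i) {
    std::printf("%.1f ", load_padded(data, i, 3));  // 0.0 1.0 2.0 3.0 0.0
  }
  std::printf("\n");
  return 0;
}
```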

1. Add a subrule in `src/meta_schedule/schedule_rule/multi_level_tiling.h/.cc` that annotates async copy for multi-level tiling (mlt).

On CUDA Cores, this PR yields a performance boost of around 1 TFLOP/s in most Conv2d test cases and 1 to 2 TFLOP/s in most GEMM test cases.

All generated code, scripts, and traces are available at https://github.com/Rainy-Memory/tvm-async-rule-benchmark.

Currently tested at commit `afbfb7aa7e43732cb716f8e443df696110be6afc` on the conv2d NHWC and GEMM NN workloads below, with an RTX 3080 GPU.

Workload: Conv2d NHWC

|Shape|Mainline TVM (GFLOP/s)|Mainline TVM with Async (GFLOP/s)|
|-|-|-|
|N=1_H=224_W=224_C=3_K=64_R=7_S=7_STR=2_PAD=3_DIL=1|13838.05219|14687.89452|
|N=1_H=56_W=56_C=64_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|5398.305085|5613.892553|
|N=1_H=56_W=56_C=64_K=64_R=3_S=3_STR=1_PAD=1_DIL=1|11652.96825|13157.88249|
|N=1_H=56_W=56_C=64_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|10638.8309|11674.68499|
|N=1_H=56_W=56_C=256_K=64_R=1_S=1_STR=1_PAD=0_DIL=1|8692.32829|9469.264089|
|N=1_H=56_W=56_C=256_K=128_R=1_S=1_STR=2_PAD=0_DIL=1|4685.767442|5698.19634|
|N=1_H=28_W=28_C=128_K=128_R=3_S=3_STR=1_PAD=1_DIL=1|9872.787087|10404.60405|
|N=1_H=28_W=28_C=128_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|9974.281496|10073.31657|
|N=1_H=28_W=28_C=512_K=128_R=1_S=1_STR=1_PAD=0_DIL=1|7075.866932|8564.572712|
|N=1_H=28_W=28_C=512_K=256_R=1_S=1_STR=2_PAD=0_DIL=1|3648.330914|4021.923142|
|N=1_H=14_W=14_C=256_K=256_R=3_S=3_STR=1_PAD=1_DIL=1|8192.954618|9160.182054|
|N=1_H=14_W=14_C=256_K=1024_R=1_S=1_STR=1_PAD=0_DIL=1|8008.870153|9362.825279|
|N=1_H=14_W=14_C=1024_K=256_R=1_S=1_STR=1_PAD=0_DIL=1|5210.062241|6051.208379|
|N=1_H=14_W=14_C=1024_K=512_R=1_S=1_STR=2_PAD=0_DIL=1|2550.787202|3587.902938|
|N=1_H=7_W=7_C=512_K=512_R=3_S=3_STR=1_PAD=1_DIL=1|4350.626084|5432.788068|
|N=1_H=7_W=7_C=512_K=2048_R=1_S=1_STR=1_PAD=0_DIL=1|6672.068026|7663.725217|
|N=1_H=7_W=7_C=2048_K=512_R=1_S=1_STR=1_PAD=0_DIL=1|3142.564263|4297.988014|

Workload: GEMM NN

|Shape|Mainline TVM (GFLOP/s)|Mainline TVM with Async (GFLOP/s)|
|-|-|-|
|M=512_N=256_K=640|8678.46|10607.37|
|M=512_N=384_K=256|8109.13|10290.72|
|M=512_N=512_K=512|11419.83|14000.86|
|M=512_N=3072_K=768|19709.39|18351.61|
|M=512_N=768_K=3072|12844.59|13730.88|
|M=896_N=896_K=896|16149.91|16131.39|
|M=1024_N=1024_K=1024|18842.11|19662.8|
|M=1152_N=1152_K=1152|15386.79|16736.1|
|M=1536_N=1536_K=1536|18522.67|18872.06|
|M=2048_N=2048_K=2048|19515.42|18874.85|
|M=3072_N=3072_K=3072|19233.9|19291.42|
|M=4096_N=4096_K=4096|17122.17|19259.01|
cblmemo authored and junrushao committed Feb 17, 2023
1 parent d7253fb commit a27315c
Showing 2 changed files with 59 additions and 0 deletions.
55 changes: 55 additions & 0 deletions src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -87,6 +87,21 @@ void MultiLevelTilingNode::InitializeWithTuneContext(const TuneContext& context)
TVM_PY_LOG(INFO, context->logger) << "'thread_warp_size' is not defined in the target";
}
}
if (Optional<String> opt_sm = context->target.value()->GetAttr<String>("arch")) {
std::string sm = opt_sm.value();
if (support::StartsWith(sm, "sm_")) {
sm = sm.substr(3);
try {
// Only sm_80 or higher supports async memcpy
if (std::stoi(sm) >= 80) {
this->stages.insert(this->stages.end(), {4, 5});
}
} catch (const std::invalid_argument& e) {
LOG(WARNING) << "ValueError: Unable to parse `target.arch`: " << sm
<< ". Details: " << e.what();
}
}
}
logger = context->logger;
}
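
As a self-contained illustration of the gate above, the sketch below reproduces the same logic in plain C++; `PipelineStagesFor` is a hypothetical helper introduced only for this example.

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Standalone sketch of the sm-version gate: strip the "sm_" prefix, parse the
// number, and enable the 4- and 5-stage pipeline candidates only on sm_80+.
std::vector<int> PipelineStagesFor(const std::string& arch) {
  std::vector<int> stages;
  if (arch.rfind("sm_", 0) == 0) {  // same check as support::StartsWith
    try {
      if (std::stoi(arch.substr(3)) >= 80) {
        stages = {4, 5};  // async copy (cp.async) requires Ampere or newer
      }
    } catch (const std::invalid_argument&) {
      // Unparsable arch string: leave `stages` empty, disabling the subrule.
    }
  }
  return stages;
}

int main() {
  for (const std::string arch : {"sm_70", "sm_80", "sm_86", "sm_banana"}) {
    std::cout << arch << " -> " << PipelineStagesFor(arch).size()
              << " pipeline candidate(s)\n";
  }
}
```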

@@ -115,6 +130,9 @@ std::vector<State> MultiLevelTilingNode::ApplySubRules(std::vector<State> states
states = SubRule(std::move(states), [&](State state) { return TileLoopNest(std::move(state)); });
states = SubRule(std::move(states), [&](State state) { return AddWriteReuse(std::move(state)); });
states = SubRule(std::move(states), [&](State state) { return AddReadReuse(std::move(state)); });
states = SubRule(std::move(states), [&](State state) {
return AddAsyncPipeline(std::move(state));
});
return states;
}
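
For context, the `SubRule` helper used above maps each candidate state to zero or more successor states and concatenates the results, so chaining subrules enumerates a cross product of scheduling choices. A minimal sketch of that combinator, with `int` standing in for `State`:

```cpp
#include <vector>

// Toy version of the SubRule expansion pattern: apply `sub_rule` to every
// state and flatten the per-state results into one candidate list.
template <typename State, typename FLambda>
std::vector<State> SubRule(std::vector<State> states, FLambda sub_rule) {
  std::vector<State> results;
  for (State& state : states) {
    for (State& next : sub_rule(std::move(state))) {
      results.push_back(std::move(next));
    }
  }
  return results;
}

int main() {
  std::vector<int> states{1};
  // A subrule that forks each state into two variants...
  states = SubRule(states, [](int s) { return std::vector<int>{s, s * 10}; });
  // ...followed by one that rewrites each state in place.
  states = SubRule(states, [](int s) { return std::vector<int>{s + 1}; });
  // states is now {2, 11}: two candidates after two chained subrules.
}
```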

@@ -280,6 +298,43 @@ std::vector<State> MultiLevelTilingNode::AddReadReuse(State state) const {
return results;
}

std::vector<State> MultiLevelTilingNode::AddAsyncPipeline(State state) const {
// For archs that do not support async pipelines, this->stages will be an empty vector
if (r_indices_.size() < 1 || this->stages.empty()) {
return {state};
}
// Currently this only supports the default config used by ScheduleRule::DefaultCUDA
// @see src/meta_schedule/schedule_rule/schedule_rule.cc
// Check that the reduction loop body contains exactly 3 for loops (the two
// shared-memory cache-read copies followed by the compute body), so it
// matches the annotation array sizes in the code below.
tir::StmtSRef r_loop_sref = state->sch->GetSRef(state->tiles[r_indices_[0]].back());
const tir::ForNode* r_for_loop = TVM_SREF_TO_FOR(r_loop_sref);
Array<tir::Stmt> seq = Downcast<tir::SeqStmt>(r_for_loop->body)->seq;
if (seq.size() != 3) {
return {state};
}
for (auto& stmt : seq) {
if (!stmt.as<tir::ForNode>()) {
return {state};
}
}

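// Fuse the reduction tiles into a single loop, then emit one candidate per
// pipeline depth. Both shared-memory copies go to (async) stage 0 and the
// compute body to stage `stage - 2`, executed in the order 0, 1, 2.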
LoopRV r_loop_fused = state->sch->Fuse(state->tiles[r_indices_[0]]);
std::vector<State> ret;
ret.push_back(state);
for (int stage : this->stages) {
State new_state = state->Copy();
new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_stage,
Array<Integer>{0, 0, stage - 2});
new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_order,
Array<Integer>{0, 1, 2});
new_state->sch->Annotate(r_loop_fused, tir::attr::software_pipeline_async_stages,
Array<Integer>{0});
ret.push_back(std::move(new_state));
}
return ret;
}
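
To make the effect of the `stage` parameter concrete, the following self-contained C++ sketch (illustrative only, not TVM code) simulates a software pipeline with a ring of prefetch buffers: with `stage = 4`, the annotations above place the compute body two stages behind the copies, so the copy for iteration `i + 2` is in flight while iteration `i` computes.

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int kIters = 6;
  const int kDepth = 2;               // stage = 4  =>  compute at stage 4 - 2 = 2
  std::vector<int> ring(kDepth, -1);  // ring of shared-memory buffers

  // Prologue: issue the first kDepth (conceptually asynchronous) copies.
  for (int i = 0; i < kDepth; ++i) ring[i % kDepth] = i;

  // Steady state: compute iteration i while the copy for i + kDepth is in flight.
  for (int i = 0; i < kIters; ++i) {
    std::printf("compute(iter=%d) using buffer %d\n", ring[i % kDepth], i % kDepth);
    int next = i + kDepth;
    if (next < kIters) ring[next % kDepth] = next;  // issue the next async copy
  }
  return 0;
}
```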

void MultiLevelTilingNode::AnnotateCooperativeFetching(Schedule* sch,
const tir::BlockRV& block) const {
// Filter out invalid vector lanes according to the data type.
4 changes: 4 additions & 0 deletions src/meta_schedule/schedule_rule/multi_level_tiling.h
@@ -148,6 +148,8 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
std::vector<State> TileLoopNest(State state) const;
// SubRule 3. add read cache
std::vector<State> AddReadReuse(State state) const;
// SubRule 4. add async pipeline
std::vector<State> AddAsyncPipeline(State state) const;

// Do nothing; Inherited from ScheduleRuleNode
void InitializeWithTuneContext(const TuneContext& context) final;
@@ -192,6 +194,8 @@ class MultiLevelTilingNode : public ScheduleRuleNode {
int thread_warp_size_;
/*! \brief The maximum number of threads to be used per block */
int max_threads_per_block_;
/*! \brief All candidate async pipeline stage counts. */
std::vector<int> stages;
/*! \brief The logging function */
PackedFunc logger;
/*! \brief The function to overwrite the default condition for applying MultiLevelTiling. */
