
* format
* added testing
sjw36 committed Nov 8, 2024
1 parent 091606f commit f7200cb
Showing 2 changed files with 117 additions and 11 deletions.
109 changes: 109 additions & 0 deletions test/TritonGPU/loop-pipeline.mlir
@@ -1,5 +1,6 @@
// RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 -canonicalize | FileCheck %s --check-prefixes=COMMON,CHECK
// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD
// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2="num_stages=2 prefetch=1" -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD_PREFETCH

// 4 warps
// matmul: 128x32 @ 32x128 -> 128x128
@@ -99,6 +100,34 @@
// AMD: triton_gpu.local_dealloc %{{.*}}
// AMD: triton_gpu.local_dealloc %{{.*}}

// Prefetch pipelining adds another stage between the global load and the compute.
// This stage performs a local_store followed by a local_load, creating a prefetch
// from shared memory into a register buffer for the compute.
//
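// A rough sketch of the schedule these checks verify (operands and types elided;
// illustrative only, not exact pipeliner output):
//
//   %a0, %b0 = tt.load ...                     // global load, iteration 0
//   triton_gpu.local_store %a0 ...             // stage iteration 0 into shared memory
//   triton_gpu.local_store %b0 ...
//   %a1, %b1 = tt.load ...                     // global load, iteration 1
//   %ra0, %rb0 = triton_gpu.local_load ...     // prefetch iteration 0 into registers
//   scf.for ... {
//     triton_gpu.local_store ...               // stage iteration i+1
//     tt.load ...                              // global load, iteration i+2
//     tt.dot ...                               // compute iteration i from registers
//     triton_gpu.local_load ...                // prefetch iteration i+1 into registers
//     scf.yield ...
//   }
//   tt.dot ...                                 // epilogue: drain the two in-flight iterations
//   tt.dot ...
//   tt.return
//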
// AMD_PREFETCH-LABEL: tt.func @matmul_loop
// AMD_PREFETCH: triton_gpu.local_alloc
// AMD_PREFETCH: triton_gpu.local_alloc
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: scf.for
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: scf.yield
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: tt.return

module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32} {
tt.func @matmul_loop(%lb : index, %ub : index, %step : index,
%A : !tt.ptr<f16> {tt.divisibility = 16 : i32},
@@ -219,6 +248,8 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index,
// AMD-COUNT-2: triton_gpu.local_dealloc
// AMD: scf.yield %[[SEL1]]

// AMD_PREFETCH-LABEL: tt.func @matmul_loop_nested

tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index,
%A : !tt.ptr<f16> {tt.divisibility = 16 : i32},
%B : !tt.ptr<f16> {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C>{
@@ -313,6 +344,22 @@ tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index,
// AMD: scf.yield %[[ADDPTR_32]], %[[DOT_31]], %[[SELECT_36]], %[[MEMDESC_SUBVIEW_37]]
// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]]

// AMD_PREFETCH-LABEL: tt.func @matmul_loop_single_pipeline
// AMD_PREFETCH: triton_gpu.local_alloc
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: scf.for
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: scf.yield
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: tt.return

tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
%A : !tt.ptr<f16> {tt.divisibility = 16 : i32},
%B : !tt.ptr<f16> {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> {
@@ -445,6 +492,33 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]]
// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]]

// AMD_PREFETCH-LABEL: tt.func @indirect_bmm_scalar
// AMD_PREFETCH: triton_gpu.local_alloc
// AMD_PREFETCH: triton_gpu.local_alloc
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: scf.for
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: scf.yield
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: tt.return

tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32},
%76: index,
%49: tensor<16x16x!tt.ptr<f16>, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32},
@@ -499,6 +573,8 @@ tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32},
// AMD: triton_gpu.local_store
// AMD: scf.yield

// AMD_PREFETCH-LABEL: tt.func @indirect_bmm_scalar_dist_one

tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32},
%76: index,
%49: tensor<16x16x!tt.ptr<f16>, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32},
@@ -597,6 +673,8 @@ tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32},
// AMD: triton_gpu.local_store %[[LOAD_56]], %[[MEMDESC_SUBVIEW_63]]
// AMD: scf.yield %[[DOT_58]], %[[ADDPTR_47]], %[[ADDPTR_48]], %[[SELECT_61]], %[[MEMDESC_SUBVIEW_62]], %[[LOAD_51]], %[[MEMDESC_SUBVIEW_63]]

// AMD_PREFETCH-LABEL: tt.func @indirect_bmm_vector

tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32},
%76: index,
%49: tensor<16x16x!tt.ptr<f16>, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32},
@@ -1243,6 +1321,23 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
// AMD: triton_gpu.local_store
// AMD: scf.yield
// AMD: triton_gpu.local_dealloc

// AMD_PREFETCH-LABEL: tt.func public @nested_loops
// AMD_PREFETCH-NOT: triton_gpu.local_alloc
// AMD_PREFETCH: scf.for
// AMD_PREFETCH: triton_gpu.local_alloc
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: scf.for
// AMD_PREFETCH: triton_gpu.local_store
// AMD_PREFETCH: tt.load
// AMD_PREFETCH: tt.dot
// AMD_PREFETCH: triton_gpu.local_load
// AMD_PREFETCH: scf.yield
// AMD_PREFETCH: triton_gpu.local_dealloc

#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}>
#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}>
#shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}>
@@ -1578,6 +1673,20 @@ tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index,
// AMD: arith.select
// AMD: scf.yield

// AMD_PREFETCH-LABEL: @masked_add_kernel
// AMD_PREFETCH: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000>
// AMD_PREFETCH-COUNT-6: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]]
// AMD_PREFETCH: scf.for
// AMD_PREFETCH: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]]
// AMD_PREFETCH: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]]
// AMD_PREFETCH: arith.addf
// AMD_PREFETCH: arith.select
// AMD_PREFETCH: tt.store
// AMD_PREFETCH: scf.yield
// AMD_PREFETCH: tt.store
// AMD_PREFETCH: tt.store
// AMD_PREFETCH: tt.store

#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
tt.func public @masked_add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
19 changes: 8 additions & 11 deletions third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp
@@ -63,11 +63,11 @@ namespace {
// can 1) issue memory operations earlier to hide the latency and 2) break the
// strong dependency inside one loop iteration to give backends flexibility to
// better interleave instructions for better instruction-level parallelism.
//
//
// This StreamPipeliner class creates the pipelining schedule and calls the
// PipelineExpander to rewrite the `scf.for` loop accordingly. A schedule consists
// of multiple stages, where ops from different stages can overlap executions
// because the dependencies are loop carried.
// PipelineExpander to rewrite the `scf.for` loop accordingly. A schedule
// consists of multiple stages, where ops from different stages can overlap
// executions because the dependencies are loop carried.
//
// The general flow of this process is:
//
@@ -95,7 +95,7 @@ namespace {
// ops in stages 1 to last_stage. This must consider that the loop
// bounds may be shorter than num_stages. In this case, the epilogue
// iterations must align with the prologue.
//
//
class StreamPipeliner {
public:
StreamPipeliner(scf::ForOp _forOp, int _numStages, bool _prefetch)
@@ -208,15 +208,12 @@ void StreamPipeliner::initSchedule(int maxIndirectionLevel) {
// compute: stage=i
// local_load: stage=i+1
// tail: stage=i
config[SCHED_LOCAL_STORE] = {lastStage - 1,
schedule.clusters.newAtBack()};
config[SCHED_LOCAL_STORE] = {lastStage - 1, schedule.clusters.newAtBack()};
auto cluster1 = schedule.clusters.newAtBack();
config[SCHED_GLOBAL_LOAD] = {0, cluster1};
config[SCHED_COMPUTE] = {lastStage, cluster1};
config[SCHED_LOCAL_LOAD] = {lastStage - 1,
schedule.clusters.newAtBack()};
config[SCHED_TAIL] = {lastStage,
schedule.clusters.newAtBack()};
config[SCHED_LOCAL_LOAD] = {lastStage - 1, schedule.clusters.newAtBack()};
config[SCHED_TAIL] = {lastStage, schedule.clusters.newAtBack()};
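// Net cluster order within one iteration of the pipelined loop is therefore
// local_store -> global_load -> compute -> local_load -> tail, which should
// match the AMD_PREFETCH checks in test/TritonGPU/loop-pipeline.mlir.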
} else if (isMultibuf) {
// Streaming Schema cluster order and staging for multi-buffer.
// for i in (...):
