Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PIPELINER] Cleanup of LoopScheduling.cpp, introduction of AssignLatencies #5176

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions include/triton/Dialect/TritonGPU/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,38 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
];
}

// Test-only pass record. It is registered under the command-line argument
// `tritongpu-test-pipeline-assign-latencies` and operates on `mlir::ModuleOp`.
def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-latencies", "mlir::ModuleOp"> {
let summary = "test assigning latencies to interesting ops ahead of pipelining";

let description = [{
This is a test pass that tests `assignLatencies` method of `TritonGPULoopScheduling`.
}];

// Dialects declared as dependencies of this pass.
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
"mlir::scf::SCFDialect",
"mlir::arith::ArithDialect"];

// Single pass option: `num-stages` (C++ member `numStages`, type int32_t),
// defaulting to 3 when not given on the command line.
let options = [
Option<"numStages", "num-stages",
"int32_t", /*default*/"3",
"number of pipeline stages">
];
}

// Test-only pass record. It is registered under the command-line argument
// `tritongpu-test-pipeline-schedule-loop` and operates on `mlir::ModuleOp`.
// This pass declares no options.
def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-loop", "mlir::ModuleOp"> {
let summary = "test scheduling a loop for software pipelining";

let description = [{
This is a test pass that tests `scheduleLoop` method of `TritonGPULoopScheduling`.
}];

// Dialects declared as dependencies of this pass.
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
"mlir::scf::SCFDialect",
"mlir::arith::ArithDialect"];
}

def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
let summary = "3xTF32 trick";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ static const char *kNumStagesAttrName = "tt.num_stages";
static const char *kLoopStageAttrName = "loop.stage";
static const char *kLoopClusterAttrName = "loop.cluster";

/// NOTE(review): presumably reports whether `forOp` has a loop-carried
/// dependency whose distance is greater than one — the definition is not
/// visible here; confirm before relying on this description.
bool loopHasDistGreaterThanOne(scf::ForOp forOp);
/// NOTE(review): presumably reports whether `forOp` is an outer loop (one that
/// contains another loop) — confirm against the definition.
bool isOuterLoop(scf::ForOp forOp);

/// Function to mask operations during scheduling.
/// Takes the rewriter to use, the operation `op`, and the predicate value
/// `pred`, and returns an `Operation *`.
/// NOTE(review): presumably the returned operation is the predicated form of
/// `op` — confirm against the definition.
Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);

Expand Down
15 changes: 13 additions & 2 deletions include/triton/Dialect/TritonGPU/Transforms/Schedule.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@
namespace mlir {
namespace triton {

namespace gpu {

/// Discover operations that should become async and assign latencies to them
/// based on the numStages value provided by the user.
///
/// \param moduleOp  module to process.
/// \param numStages number of pipeline stages requested by the user.
/// \returns a map from operation to the latency assigned to it.
// The parameter was previously named `forOp` although its type is ModuleOp;
// declaration parameter names are not part of the C++ interface, so callers
// and the out-of-line definition are unaffected by the rename.
DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp, int numStages);

/// Schedule the loop based on the latencies assigned to the operations.
///
/// \param forOp     loop to schedule.
/// \param opLatency per-operation latencies (same map type as returned by
///                  assignLatencies).
void scheduleLoop(scf::ForOp forOp,
                  const DenseMap<Operation *, int> &opLatency);

} // namespace gpu

/// This fills out the pipelining options including schedule and annotations
/// for wait ops. This also does pre-processing by converting some of the
/// loads into async loads so that the IR is ready to be pipelined.
Expand Down Expand Up @@ -108,8 +120,7 @@ class CoarseSchedule {

// Add dependencies of anchor ops to the coarse schedule. Schedule them to
// the same stage and ordering cluster as the anchor op.
void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule,
int numStages);
void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule);

} // namespace triton
} // namespace mlir
Expand Down
3 changes: 3 additions & 0 deletions lib/Dialect/TritonGPU/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@ add_triton_library(TritonGPUTransforms
OptimizeAccumulatorInit.cpp
OptimizeDotOperands.cpp
OptimizeThreadLocality.cpp
Pipeliner/AssignLatencies.cpp
Pipeliner/MatmulLoopPipeline.cpp
Pipeliner/OuterLoopPipeline.cpp
Pipeliner/PipelineExpander.cpp
Pipeliner/TestPipelineAssignLatencies.cpp
Pipeliner/TestPipelineScheduleLoop.cpp
Pipeliner/SoftwarePipeliner.cpp
Pipeliner/TMAStoresPipeline.cpp
Pipeliner/PipeliningUtility.cpp
Expand Down
Loading
Loading