Skip to content

Commit

Permalink
[DmaLoopSubsumption] Relax circular dma loop subsumption condition (#826
Browse files Browse the repository at this point in the history
)

1. This PR relaxes the condition for circular dma ops loop subsumption,
so that npu.circular_dma_cpy_nd ops can be hoisted out of the loop even
if there is another npu.dma_cpy_nd user of the same connection op after
it.
2. With this change, we can further subsume loops and hoist
npu.dma_cpy_nd ops out of the loop. This PR makes use of
#812 and brings the dma
optimizations in Passes.cpp.
  • Loading branch information
yzhang93 authored Oct 10, 2024
1 parent 17f9a01 commit a533e7d
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 106 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
#include "iree-amd-aie/IR/AMDAIEDialect.h"
#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree-amd-aie/Transforms/Transforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#define DEBUG_TYPE "iree-amdaie-canonicalize-doubly-strided-dma"

Expand All @@ -17,90 +19,105 @@ namespace {

/// Recognize linear accesses across multiple DMA access dimensions and fold
/// them.
LogicalResult foldDmaOpLinearDims(RewriterBase &rewriter,
AMDAIE::DoublyStridedOpInterface op) {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
SmallVector<OpFoldResult> newSourceOffsets, newSourceSizes, newSourceStrides,
newTargetOffsets, newTargetSizes, newTargetStrides;
LogicalResult sourceRes =
foldLinearDims(op.getContext(), sourceOffsets, sourceSizes, sourceStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
LogicalResult targetRes =
foldLinearDims(op.getContext(), targetOffsets, targetSizes, targetStrides,
newTargetOffsets, newTargetSizes, newTargetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
struct FoldDmaOpLinearDims
: public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
PatternRewriter &rewriter) const override {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
SmallVector<OpFoldResult> newSourceOffsets, newSourceSizes,
newSourceStrides, newTargetOffsets, newTargetSizes, newTargetStrides;
LogicalResult sourceRes = foldLinearDims(
op.getContext(), sourceOffsets, sourceSizes, sourceStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
LogicalResult targetRes = foldLinearDims(
op.getContext(), targetOffsets, targetSizes, targetStrides,
newTargetOffsets, newTargetSizes, newTargetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}
};

/// Fold single dimension linear accesses and make them implicit.
LogicalResult foldDmaOpSingleDims(RewriterBase &rewriter,
AMDAIE::DoublyStridedOpInterface op) {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
LogicalResult sourceRes =
foldSingleDim(sourceOffsets, sourceSizes, sourceStrides);
LogicalResult targetRes =
foldSingleDim(targetOffsets, targetSizes, targetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
struct FoldDmaOpSingleDims
: public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
PatternRewriter &rewriter) const override {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
LogicalResult sourceRes =
foldSingleDim(sourceOffsets, sourceSizes, sourceStrides);
LogicalResult targetRes =
foldSingleDim(targetOffsets, targetSizes, targetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, targetOffsets, targetSizes, targetStrides, sourceOffsets,
sourceSizes, sourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, targetOffsets, targetSizes, targetStrides, sourceOffsets,
sourceSizes, sourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}
};

/// Fold unit dimensions within a strided access pattern.
LogicalResult foldDmaOpUnitDims(RewriterBase &rewriter,
AMDAIE::DoublyStridedOpInterface op) {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
SmallVector<OpFoldResult> newSourceOffsets, newSourceSizes, newSourceStrides,
newTargetOffsets, newTargetSizes, newTargetStrides;
LogicalResult sourceRes =
foldUnitDims(sourceOffsets, sourceSizes, sourceStrides, newSourceOffsets,
newSourceSizes, newSourceStrides);
LogicalResult targetRes =
foldUnitDims(targetOffsets, targetSizes, targetStrides, newTargetOffsets,
newTargetSizes, newTargetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
struct FoldDmaOpUnitDims
: public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

LogicalResult matchAndRewrite(AMDAIE::DoublyStridedOpInterface op,
PatternRewriter &rewriter) const override {
OpBuilder::InsertionGuard guard(rewriter);
SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
SmallVector<OpFoldResult> sourceSizes = op.getSourceMixedSizes();
SmallVector<OpFoldResult> sourceStrides = op.getSourceMixedStrides();
SmallVector<OpFoldResult> targetOffsets = op.getTargetMixedOffsets();
SmallVector<OpFoldResult> targetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> targetStrides = op.getTargetMixedStrides();
SmallVector<OpFoldResult> newSourceOffsets, newSourceSizes,
newSourceStrides, newTargetOffsets, newTargetSizes, newTargetStrides;
LogicalResult sourceRes =
foldUnitDims(sourceOffsets, sourceSizes, sourceStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
LogicalResult targetRes =
foldUnitDims(targetOffsets, targetSizes, targetStrides,
newTargetOffsets, newTargetSizes, newTargetStrides);
if (failed(sourceRes) && failed(targetRes)) {
return failure();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}

rewriter.setInsertionPointAfter(op);
auto newDoublyStridedOp = op.createDoublyStridedOp(
rewriter, newTargetOffsets, newTargetSizes, newTargetStrides,
newSourceOffsets, newSourceSizes, newSourceStrides);
rewriter.replaceOp(op, newDoublyStridedOp.getOperation());
return success();
}
};

class AMDAIECanonicalizeDoublyStridedOpPass
: public impl::AMDAIECanonicalizeDoublyStridedOpBase<
Expand All @@ -121,30 +138,28 @@ class AMDAIECanonicalizeDoublyStridedOpPass

void AMDAIECanonicalizeDoublyStridedOpPass::runOnOperation() {
Operation *parentOp = getOperation();
IRRewriter rewriter(parentOp->getContext());

// Fold DMA unit dimensions. Needs to happen before folding linear dimensions
// to avoid blocking detection of linear dimension folding opportunities due
// to a unit dimension in between.
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) {
(void)foldDmaOpUnitDims(rewriter, dmaOp);
});
MLIRContext *context = &getContext();
RewritePatternSet patterns(context);

populateCanonicalizeDoublyStridedOpPatterns(patterns, foldSingleDims);
if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
parentOp->emitOpError(
"failed to canonicalize doubly strided DMA operations");
return signalPassFailure();
}
}

// Fold linear dimensions within a DMA op.
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) {
(void)foldDmaOpLinearDims(rewriter, dmaOp);
});
} // namespace

// Make DMA accesses with single dimension implicit.
void populateCanonicalizeDoublyStridedOpPatterns(RewritePatternSet &patterns,
bool foldSingleDims) {
patterns.add<FoldDmaOpUnitDims>(patterns.getContext());
patterns.add<FoldDmaOpLinearDims>(patterns.getContext());
if (foldSingleDims) {
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) {
(void)foldDmaOpSingleDims(rewriter, dmaOp);
});
patterns.add<FoldDmaOpSingleDims>(patterns.getContext());
}
}

} // namespace

std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass(
AMDAIECanonicalizeDoublyStridedOpOptions options) {
return std::make_unique<AMDAIECanonicalizeDoublyStridedOpPass>(options);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ void AMDAIEDmaCompositionPass::runOnOperation() {
onlyZeroStrideOnOuterDim);
}
populateStridedOpCombinationPattern(patterns);
populateCanonicalizeDoublyStridedOpPatterns(patterns, false);
if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
parentOp->emitOpError("failed to compose strided operations");
return signalPassFailure();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ struct SubsumeLoopIntoDMA
if (!isa<LoopLikeOpInterface>(parentOp))
return rewriter.notifyMatchFailure(op, "Parent is not a loop-like op");

auto hasUsersInSameScope = [&](Value result) -> bool {
auto hasOtherUsersInSameScope = [&](Value result) -> bool {
for (Operation *userOp : result.getUsers()) {
if (userOp != op.getOperation() && parentOp->isProperAncestor(userOp)) {
return true;
Expand All @@ -501,6 +501,25 @@ struct SubsumeLoopIntoDMA
return false;
};

auto hasCircularUsersInSameScope =
[&](SmallVector<AMDAIE::DoublyStridedOpInterface> users) -> bool {
bool currentCircularDma = false;
for (AMDAIE::DoublyStridedOpInterface userOp : llvm::reverse(users)) {
// Check if there is another circular dma user in the same scope.
if (isa<AMDAIE::NpuCircularDmaCpyNdOp>(userOp) &&
userOp != op.getOperation()) {
return true;
}
// Check if there is another user before the current one in the same scope.
if (userOp == op.getOperation()) {
currentCircularDma = true;
continue;
}
if (currentCircularDma) return true;
}
return false;
};

uint8_t sourceMemspaceInt;
uint8_t targetMemspaceInt;
if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op.getOperation())) {
Expand All @@ -526,7 +545,7 @@ struct SubsumeLoopIntoDMA
"merged with other connections, so abort loop subsumption as it "
"could potentially lead to deadlocks");
}
if (hasUsersInSameScope(connectionOp.getResult())) {
if (hasOtherUsersInSameScope(connectionOp.getResult())) {
return rewriter.notifyMatchFailure(
op,
"Has users of same DMA in scope, analysis to check validity of "
Expand All @@ -538,16 +557,28 @@ struct SubsumeLoopIntoDMA
sourceMemspaceInt = npuCircularDmaOp.getSourceMemorySpaceAsUInt();
targetMemspaceInt = npuCircularDmaOp.getTargetMemorySpaceAsUInt();

// Check that the connection this `amdaie.npu.dma_cpy_nd` operation is
// operating on, is not being touched within the same scope. Otherwise,
// the rewrite is not valid in general as it would be changing the
// temporal usage of the source connection.
// Check that the connection this `amdaie.npu.circular_dma_cpy_nd` op is
// operating on, satisfies the following conditions:
// 1) No other user of the connection has the Circular trait in the same
// scope; 2) No other user of the connection appears before this circular dma op
// in the same scope. Otherwise, the rewrite is not valid in general as it
// would be changing the temporal usage of the source connection.
AMDAIE::ConnectionOp connectionOp = npuCircularDmaOp.getConnectionOp();
if (!connectionOp) {
return rewriter.notifyMatchFailure(
op, "should operate on an `amdaie.connection` op");
}
if (hasUsersInSameScope(connectionOp.getResult())) {
// Walk all dma ops in order and get those which are the users of the
// current connection op.
SmallVector<AMDAIE::DoublyStridedOpInterface> dmaUsers;
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface op) {
auto dmaConnection = dyn_cast_if_present<AMDAIE::ConnectionOp>(
op->getOperand(0).getDefiningOp());
if (dmaConnection && dmaConnection == connectionOp) {
dmaUsers.push_back(op);
}
});
if (hasCircularUsersInSameScope(dmaUsers)) {
return rewriter.notifyMatchFailure(
op,
"Has users of same DMA in scope, analysis to check validity of "
Expand Down
18 changes: 15 additions & 3 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,8 @@ void buildAMDAIETransformPassPipeline(
}
modulePassManager.addPass(createLowerUKernelOpsToCallsPass());
if (useLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo) {
addAMDAIEObjectFifoLoweringPasses(modulePassManager, enablePacketFlow);
addAMDAIEObjectFifoLoweringPasses(modulePassManager, enablePacketFlow,
useTilePipeline);
} else if (useLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) {
addMLIRAIRLoweringPasses(modulePassManager, device, useTilePipeline,
matmulElementwiseFusion);
Expand All @@ -541,11 +542,22 @@ void buildAMDAIETransformPassPipeline(
}

void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
bool enablePacketFlow) {
bool enablePacketFlow,
TilePassPipeline useTilePipeline) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createAMDAIEConvertToDmaPass());
// For matmul pipelines, we do transpose on target side for pack ops to get
// better performance. While for convolution pipelines, the same settings
// cause 'aie.dma_bd' error, so for now keep using transpose on source for
// both pack and unpack ops.
// TODO(vivian): explore the other options for conv ops.
AMDAIEConvertToDmaOptions dmaOptions;
dmaOptions.packTransposeOnSource =
(useTilePipeline == TilePassPipeline::ConvDecomposePipeline) ? true
: false;
dmaOptions.unpackTransposeOnSource = true;
passManager.addPass(createAMDAIEConvertToDmaPass(dmaOptions));

passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
passManager.addPass(createAMDAIEInsertCoresPass());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ namespace mlir::iree_compiler::AMDAIE {

/// Add passes to lower to AIE objectFifos.
void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
bool enablePacketFlow);
bool enablePacketFlow,
TilePassPipeline useTilePipeline);

/// Add passes to lower from MLIR-AIR through AIE. This is
/// currently the default passes used for lowering after IREEs tiling.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForOp forOp);
LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
scf::ForallOp forallOp);

/// Populate patterns that canonicalize doubly strided DMA operations.
void populateCanonicalizeDoublyStridedOpPatterns(RewritePatternSet &patterns,
bool foldSingleDims);

/// Populate patterns that subsume loops iterations into DMA access patterns.
void populateDmaLoopSubsumptionPattern(RewritePatternSet &patterns,
AMDAIE::AMDAIEDeviceModel &&deviceModel,
Expand Down
Loading

0 comments on commit a533e7d

Please sign in to comment.