Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SplitLogicalObjFifo] Fix split-logicalobjfifo pass to analyse unique producers/consumers ObjFifos #1060

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -1717,6 +1717,22 @@ def __init__(self):
use_chess=False,
)
)
self.register(
Matmul(
512,
512,
256,
"i32",
"i32",
name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling",
tile_pipeline="pack-peel-4-level-tiling",
run_on_target=["npu4"],
aie_compilation_flags=[
"--iree-amdaie-num-rows=4",
"--iree-amdaie-num-cols=8",
],
)
)

for target in ["npu1_4col", "npu4"]:
self.register(
Expand Down Expand Up @@ -1782,6 +1798,24 @@ def __init__(self):
additional_labels=["I8UKernel"],
)
)
self.register(
Matmul(
64,
64,
64,
"bf16",
"f32",
name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling_ukernel",
use_ukernel=True,
tile_pipeline="pack-peel-4-level-tiling",
run_on_target=["npu4"],
aie_compilation_flags=[
"--iree-amdaie-num-rows=4",
"--iree-amdaie-num-cols=8",
],
use_chess=True,
Abhishek-Varma marked this conversation as resolved.
Show resolved Hide resolved
)
)

# Matmul test on 2(rows)x2(cols) cores
self.register(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeConsumers(
SmallVector<mlir::CopyOpInterface> copyLikOps;
for (Operation *userOp : op->getUsers()) {
if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
dyn_cast_if_present<LogicalObjFifoOpInterface>(
copyOp.getSource().getDefiningOp()) == op) {
copyOp && dyn_cast_if_present<LogicalObjFifoOpInterface>(
copyOp.getSource().getDefiningOp()) == op) {
copyLikOps.push_back(copyOp);
}
}
Expand All @@ -31,8 +31,8 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeProducers(
SmallVector<mlir::CopyOpInterface> copyLikOps;
for (Operation *userOp : op->getUsers()) {
if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
dyn_cast_if_present<LogicalObjFifoOpInterface>(
copyOp.getTarget().getDefiningOp()) == op) {
copyOp && dyn_cast_if_present<LogicalObjFifoOpInterface>(
copyOp.getTarget().getDefiningOp()) == op) {
copyLikOps.push_back(copyOp);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ extern "C" {
}

matmul_combos(matmul_vectorized_c_func, 16, 8, 32)
matmul_combos(matmul_vectorized_c_func, 16, 8, 64)
matmul_combos(matmul_vectorized_c_func, 16, 16, 32)
matmul_combos(matmul_vectorized_c_func, 32, 32, 32)
matmul_combos(matmul_vectorized_c_func, 32, 32, 64)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,45 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
return splitStride;
}

/// Counts the distinct LogicalObjectFifoFromMemrefOps found on one side of the
/// given copy-like ops: the target side when `OperateOn` is
/// `CopyOpOperateOn::Target`, the source side otherwise. The resulting count
/// helps determine the split factor for a LogicalObjectFifo.
///
/// An example case which necessitated this feature:
///      %lhs = LOF_on_L2
///      %a = LOF_on_L1_0
///      %b = LOF_on_L1_1
///      %c = LOF_on_L1_2
///      DMA(%a, %lhs)
///      DMA(%b, %lhs)
///      DMA(%c, %lhs)
///      DMA(%b, %lhs)
///      DMA(%c, %lhs)
///
/// In the above snippet, although there are 5 DMA ops for L2<->L1, they only
/// involve 3 distinct L1 objectFifos. Hence %lhs is split into 3 parts instead
/// of 5.
///
/// Returns the number of distinct objectFifos. Emits an error on the offending
/// copy op and returns failure if the inspected operand of any copy op is not
/// defined by a LogicalObjectFifoFromMemrefOp.
template <CopyOpOperateOn OperateOn>
static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifoUsers(
    ArrayRef<CopyOpInterface> copyLikeOps) {
  // Keyed on the defining operation so that several copies touching the same
  // objectFifo are only counted once.
  DenseSet<Operation *> uniqueLof;
  for (CopyOpInterface copyOp : copyLikeOps) {
    AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr;
    // `dyn_cast_if_present` tolerates a null defining op (e.g. a block
    // argument); that case is reported through the `!lof` check below.
    if constexpr (OperateOn == CopyOpOperateOn::Target) {
      lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
          copyOp.getTarget().getDefiningOp());
    } else {
      lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
          copyOp.getSource().getDefiningOp());
    }
    if (!lof) {
      return copyOp.emitOpError()
             << "could not retrieve source/target objectFifo";
    }
    uniqueLof.insert(lof);
  }
  // Make the unsigned -> signed conversion of the set size explicit.
  return static_cast<int64_t>(uniqueLof.size());
}

/// Find the logical objectFifo and DMA source/target splitting dimensions for
/// each DMA and objectFifo pair.
///
Expand All @@ -138,6 +177,9 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
/// that has product size larger than the other side's product size after
/// splitting because that's the number of elements that should be
/// produced/consumed on the respective sides before splitting.
/// Towards the end fetch the count of unique L2<->L1 for the objectFifo which
Abhishek-Varma marked this conversation as resolved.
Show resolved Hide resolved
/// will be split. The split factor is the std::gcd of that count and the total
/// no. of columns; if it does not evenly divide both the source and target
/// sizes on the split dimensions, std::gcd of those two sizes is used instead.
LogicalResult collectSplittingDims(
const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
Expand Down Expand Up @@ -218,17 +260,31 @@ LogicalResult collectSplittingDims(
// Calculate the new source stride to be used for splitting the DMA.
int64_t newSourceStride =
splitStride != 1 ? splitDimSize / splitStride : 1;
FailureOr<int64_t> maybeNumUniqueConsumers =
fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Target>(
objFifo.getCopyLikeConsumers());
if (failed(maybeNumUniqueConsumers)) {
objFifo.emitOpError()
<< "could not retrieve total unique L2<->L1 pairs";
Abhishek-Varma marked this conversation as resolved.
Show resolved Hide resolved
}
int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols);
int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
int64_t targetSize = (*targetSizes)[targetSplitDim];
if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
splitFactor = std::gcd(sourceSize, targetSize);
}
LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "newSourceStride: " << newSourceStride << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "objFifoSplitDim: " << objFifoSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim,
1, numCols};
objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
1, splitFactor};
objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
splitStride};
} else if (dmaOp.getSourceObjectFifo() == objFifo) {
// Find outermost dimension in the access pattern that has stride ==
// sizeAfterSplit and size != 1.
Expand Down Expand Up @@ -274,17 +330,31 @@ LogicalResult collectSplittingDims(
// Calculate the new target stride to be used for splitting the DMA.
int64_t newTargetStride =
splitStride != 1 ? splitDimSize / splitStride : 1;
FailureOr<int64_t> maybeNumUniqueProducers =
fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Source>(
objFifo.getCopyLikeProducers());
if (failed(maybeNumUniqueProducers)) {
objFifo.emitOpError()
<< "could not retrieve total unique L2<->L1 pairs";
}
int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols);
int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
int64_t targetSize = (*targetSizes)[targetSplitDim];
if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
splitFactor = std::gcd(sourceSize, targetSize);
}
LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "newTargetStride: " << newTargetStride << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "objFifoSplitDim: " << objFifoSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim,
newTargetStride, numCols};
objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
newTargetStride, splitFactor};
objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
splitStride};
}
}
return success();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,7 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
AMDAIE::DoublyStridedOpInterface op,
size_t sourceSplitDim, size_t targetSplitDim,
std::optional<size_t> maybeSplitFactor,
int64_t splitFactor,
int64_t sourceSplitStride,
int64_t targetSplitStride) {
if (!op->use_empty())
Expand Down Expand Up @@ -800,15 +800,6 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
}
int64_t sourceSize = maybeSourceSize.value();
int64_t targetSize = maybeTargetSize.value();
int64_t splitFactor = maybeSplitFactor.has_value()
? maybeSplitFactor.value()
: std::gcd(sourceSize, targetSize);
if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
int64_t newSplitFactor = std::gcd(sourceSize, targetSize);
LLVM_DEBUG(llvm::dbgs() << "split factor has been changed from "
<< splitFactor << " to " << newSplitFactor);
splitFactor = newSplitFactor;
}

int64_t newSourceSize = sourceSize / splitFactor;
int64_t newTargetSize = targetSize / splitFactor;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,14 @@ LogicalResult splitLogicalObjectFifo(
int64_t splitStride = 1);

/// Split doubly strided operations on a source and target split dimension with
/// the provided split factor. If no split factor is provided, the doubly
/// strided operation will be split on the size of the dimension being split.
LogicalResult splitDoublyStridedOp(
IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
size_t sourceSplitDim = 0, size_t targetSplitDim = 0,
std::optional<size_t> splitFactor = std::nullopt,
int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
/// the provided split factor.
LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
AMDAIE::DoublyStridedOpInterface op,
size_t sourceSplitDim = 0,
size_t targetSplitDim = 0,
int64_t splitFactor = 1,
int64_t sourceSplitStride = 1,
int64_t targetSplitStride = 1);

} // namespace mlir::iree_compiler::AMDAIE

Expand Down
Loading
Loading