Commit 64a07f8

[BACKEND] Fix missing barrier before tcgen05.copy (#6061)
The membar analysis needs a more accurate model of tmem_copy's memory effects in order to insert the required barrier.
1 parent 27bcf56 commit 64a07f8

4 files changed (+30, -10 lines)

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -417,7 +417,7 @@ def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [DeclareOpInterfaceMethods<MemoryEf
   let hasVerifier = 1;
 }
 
-def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy", [MemoryEffects<[MemWrite]>]> {
+def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
   let summary = "Initiate an asynchronous copy operation from shared memory to the Tensor Memory.";
 
   let description = [{

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 9 additions & 1 deletion
@@ -388,7 +388,6 @@ LogicalResult TMEMAllocOp::verify() {
   return success();
 }
 
-// TMEMAllocOp
 void TMEMAllocOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
@@ -452,6 +451,15 @@ LogicalResult TMEMCopyOp::verify() {
   return success();
 }
 
+void TMEMCopyOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Write::get(),
+                       mlir::triton::nvidia_gpu::TensorMemory::get());
+  effects.emplace_back(MemoryEffects::Read::get(), &getSrcMutable(),
+                       mlir::triton::gpu::SharedMemory::get());
+}
+
 } // namespace nvidia_gpu
 } // namespace triton
 } // namespace mlir

python/test/unit/language/test_matmul.py

Lines changed: 0 additions & 8 deletions
@@ -445,21 +445,13 @@ def block_scale_mxfp_matmul( #
     tl.store(output_ptrs, accumulator, mask=c_mask)
 
 
-def _knob_disable_ptxas_opt(monkeypatch):
-    monkeypatch.setenv("DISABLE_PTXAS_OPT", "1")
-
-
 @pytest.mark.parametrize("M, N, K", [(1024, 512, 512), (998, 111, 512), (63, 128, 512)])
 @pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(128, 128, 128), (256, 128, 128), (128, 256, 128),
                                                        (128, 128, 256), (128, 256, 256)])
 @pytest.mark.parametrize("NUM_STAGES", [1, 2, 4])
 @pytest.mark.parametrize("USE_2D_SCALE_LOAD", [False, True])
 @pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 10, reason="Requires compute capability >= 10")
 def test_blocked_scale_mxfp(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, USE_2D_SCALE_LOAD, device, monkeypatch):
-    if NUM_STAGES == 1 and USE_2D_SCALE_LOAD:
-        # Disabling ptxas optimization as a temporary workaround, otherwise the test does not pass
-        _knob_disable_ptxas_opt(monkeypatch)
-
     if BLOCK_N == 256 and BLOCK_K == 256:
         NUM_STAGES = min(NUM_STAGES, 2)
     elif BLOCK_K == 256:
test/Analysis/test-membar.mlir

Lines changed: 20 additions & 0 deletions
@@ -967,3 +967,23 @@ tt.func @direct_backedge_within_loop(%arg0: index, %arg1: index, %arg2: index, %
 }
 
 }
+
+// -----
+
+// CHECK-LABEL: tmem_copy_after_alloc
+#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+#tmem_scales = #ttng.tensor_memory_scales_encoding<>
+module attributes {"ttg.num-warps" = 4 : i32} {
+  tt.func @tmem_copy_after_alloc(%arg0: tensor<1x2048xf8E4M3FN, #blocked>) {
+    // CHECK: local_alloc
+    %0 = ttg.local_alloc %arg0 {allocation.offset = 53248 : i32} : (tensor<1x2048xf8E4M3FN, #blocked>) -> !ttg.memdesc<1x2048xf8E4M3FN, #shared, #smem>
+    // CHECK: tmem_alloc
+    %1 = ttng.tmem_alloc {tensor_memory_col_offset = 256 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable>
+    // CHECK: gpu.barrier
+    // CHECK: tmem_copy
+    ttng.tmem_copy %0, %1, : (!ttg.memdesc<1x2048xf8E4M3FN, #shared, #smem>, !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable>) -> ()
+    tt.return
+  }
+}
