24 changes: 22 additions & 2 deletions mlir/test/Examples/NVGPU/Ch0.py
@@ -1,5 +1,9 @@
-# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'


# ===----------------------------------------------------------------------===//
# Chapter 0 : Hello World
@@ -43,8 +47,24 @@ def kernel():
# 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
main(alpha)


# CHECK: GPU thread 0 has 100
# CHECK: GPU thread 1 has 101
# CHECK: GPU thread 2 has 102
# CHECK: GPU thread 3 has 103

# DUMPIR: func.func @main(%arg0: index) attributes {llvm.emit_c_interface} {
# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_0:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_1:.*]] = arith.constant 1 : index
# DUMPIR: %[[C4:.*]] = arith.constant 4 : index
# DUMPIR: %[[C1_2:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
# DUMPIR: %[[C0_I32:.*]] = arith.constant 0 : i32
# DUMPIR: gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %[[C1]], %arg8 = %[[C1_0]], %arg9 = %[[C1_1]]) threads(%arg4, %arg5, %arg6) in (%arg10 = %[[C4]], %arg11 = %[[C1_2]], %arg12 = %[[C1_3]]) dynamic_shared_memory_size %[[C0_I32]] {
# DUMPIR: %[[TIDX:.*]] = gpu.thread_id x
# DUMPIR: %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index
# DUMPIR: gpu.printf "GPU thread %llu has %llu\0A", %[[TIDX]], %[[MYVAL]] : index, index
# DUMPIR: gpu.terminator
# DUMPIR: }
# DUMPIR: return
# DUMPIR: }
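The new RUN block gives each chapter two modes: when %mlir_run_cuda_sm90_tests is "1" the example executes on the GPU and its runtime output is matched against the CHECK lines; otherwise MLIR_NVDSL_PRINT_IR=1 is exported so the script prints the generated MLIR instead, which is matched against the DUMPIR lines. Chapters 1 and 2 gate their NumPy verification on the same variable. Below is a minimal self-contained sketch of that gating, with an in-place NumPy update standing in for the real NVDSL saxpy kernel call (the stand-in is an assumption for illustration only):

import os

import numpy as np

M, N = 256, 32
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)

# In the real test this is the NVDSL kernel call: saxpy(x, y, alpha).
# The equivalent NumPy update stands in so the sketch runs end to end.
y += x * alpha

if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
    # Execution mode: verify the result against a host-side reference.
    ref = np.ones((M, N), np.float32)
    ref += x * alpha
    np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
    print("PASS")
# In IR-dump mode the kernel never runs, so there is nothing to verify.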
48 changes: 42 additions & 6 deletions mlir/test/Examples/NVGPU/Ch1.py
@@ -1,5 +1,9 @@
-# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'


# ===----------------------------------------------------------------------===//
# Chapter 1 : 2D Saxpy
@@ -56,11 +60,43 @@ def saxpy_kernel():
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)

saxpy(x, y, alpha)

-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+    # 4. Verify MLIR with reference computation
+    ref = np.ones((M, N), np.float32)
+    ref += x * alpha
+    np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+    print("PASS")
# CHECK-NOT: Mismatched elements
# CHECK: PASS

# DUMPIR: func.func @saxpy(%arg0: memref<256x32xf32>, %arg1: memref<256x32xf32>, %arg2: f32) attributes {llvm.emit_c_interface} {
# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
# DUMPIR: %[[MEMREF0:.*]], %[[ASYNC1:.*]] = gpu.alloc async [%[[ASYNC0]]] () : memref<256x32xf32>
# DUMPIR: %[[MEMCPY1:.*]] = gpu.memcpy async [%[[ASYNC1]]] %[[MEMREF]], %arg0 : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[MEMCPY2:.*]] = gpu.memcpy async [%[[MEMCPY1]]] %[[MEMREF0]], %arg1 : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[WAIT1:.*]] = gpu.wait async [%[[MEMCPY2]]]
# DUMPIR: %[[C256:.*]] = arith.constant 256 : index
# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_2:.*]] = arith.constant 1 : index
# DUMPIR: %[[C32:.*]] = arith.constant 32 : index
# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_4:.*]] = arith.constant 1 : index
# DUMPIR: %[[C0_I32:.*]] = arith.constant 0 : i32
# DUMPIR: gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %[[C256]], %arg10 = %[[C1]], %arg11 = %[[C1_2]]) threads(%arg6, %arg7, %arg8) in (%arg12 = %[[C32]], %arg13 = %[[C1_3]], %arg14 = %[[C1_4]]) dynamic_shared_memory_size %[[C0_I32]] {
# DUMPIR: %[[BLOCKID:.*]] = gpu.block_id x
# DUMPIR: %[[THREADID:.*]] = gpu.thread_id x
# DUMPIR: %[[LD0:.*]] = memref.load %[[MEMREF]][%[[BLOCKID]], %[[THREADID]]] : memref<256x32xf32>
# DUMPIR: %[[LD1:.*]] = memref.load %[[MEMREF0]][%[[BLOCKID]], %[[THREADID]]] : memref<256x32xf32>
# DUMPIR: %[[MUL:.*]] = arith.mulf %[[LD0]], %arg2 : f32
# DUMPIR: %[[ADD:.*]] = arith.addf %[[LD1]], %[[MUL]] : f32
# DUMPIR: memref.store %[[ADD]], %[[MEMREF0]][%[[BLOCKID]], %[[THREADID]]] : memref<256x32xf32>
# DUMPIR: gpu.terminator
# DUMPIR: }
# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%[[WAIT1]]] %arg1, %[[MEMREF0]] : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[WAIT2:.*]] = gpu.wait async [%[[MEMCPY3]]]
# DUMPIR: return
# DUMPIR: }
Collaborator:
That seems like a fragile test to me, can this be more targeted?

Contributor (author):
yes, I will update the test as part of the new PR.
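For reference, a more targeted variant could anchor on the function with a LABEL directive and pin down only the structurally interesting ops instead of every materialized constant. An illustrative sketch against the Ch1 dump, not the promised follow-up:

# DUMPIR-LABEL: func.func @saxpy
# DUMPIR:       gpu.launch
# DUMPIR-DAG:   gpu.block_id x
# DUMPIR-DAG:   gpu.thread_id x
# DUMPIR:       arith.mulf
# DUMPIR:       arith.addf
# DUMPIR:       memref.store
# DUMPIR:       gpu.terminator

LABEL splits the input at the function header so matches cannot bleed across functions, and dropping the %[[C...]] constant captures keeps the test from breaking whenever constant materialization order changes.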

82 changes: 76 additions & 6 deletions mlir/test/Examples/NVGPU/Ch2.py
@@ -1,5 +1,9 @@
-# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'


# ===----------------------------------------------------------------------===//
# Chapter 2 : 2D Saxpy with TMA
@@ -85,9 +89,75 @@ def saxpy_tma_kernel():
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)

-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+    # 4. Verify MLIR with reference computation
+    ref = np.ones((M, N), np.float32)
+    ref += x * alpha
+    np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+    print("PASS")
# CHECK-NOT: Mismatched elements
# CHECK: PASS

# DUMPIR: func.func @saxpy(%arg0: memref<256x32xf32>, %arg1: memref<256x32xf32>, %arg2: f32) attributes {llvm.emit_c_interface} {
# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
# DUMPIR: %[[MEMREF0:.*]], %[[ASYNC1:.*]] = gpu.alloc async [%[[ASYNC0]]] () : memref<256x32xf32>
# DUMPIR: %[[MEMCPY1:.*]] = gpu.memcpy async [%[[ASYNC1]]] %[[MEMREF]], %arg0 : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[MEMCPY2:.*]] = gpu.memcpy async [%[[MEMCPY1]]] %[[MEMREF0]], %arg1 : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[WAIT1:.*]] = gpu.wait async [%[[MEMCPY2]]]
# DUMPIR: %[[CAST:.*]] = memref.cast %[[MEMREF]] : memref<256x32xf32> to memref<*xf32>
# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
# DUMPIR: %[[C32:.*]] = arith.constant 32 : index
# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %[[CAST]] box[%[[C1]], %[[C32]]] : memref<*xf32> -> <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
# DUMPIR: %[[CAST2:.*]] = memref.cast %[[MEMREF0]] : memref<256x32xf32> to memref<*xf32>
# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
# DUMPIR: %[[C32_4:.*]] = arith.constant 32 : index
# DUMPIR: %[[TMA1:.*]] = nvgpu.tma.create.descriptor %[[CAST2]] box[%[[C1_3]], %[[C32_4]]] : memref<*xf32> -> <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
# DUMPIR: %[[C256:.*]] = arith.constant 256 : index
# DUMPIR: %[[C1_5:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_6:.*]] = arith.constant 1 : index
# DUMPIR: %[[C32_7:.*]] = arith.constant 32 : index
# DUMPIR: %[[C1_8:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_9:.*]] = arith.constant 1 : index
# DUMPIR: %[[C256_I32:.*]] = arith.constant 256 : i32
# DUMPIR: gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %[[C256]], %arg10 = %[[C1_5]], %arg11 = %[[C1_6]]) threads(%arg6, %arg7, %arg8) in (%arg12 = %[[C32_7]], %arg13 = %[[C1_8]], %arg14 = %[[C1_9]]) dynamic_shared_memory_size %[[C256_I32]] {
# DUMPIR: %[[BLOCKID:.*]] = gpu.block_id x
# DUMPIR: %[[THREADID:.*]] = gpu.thread_id x
# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %[[THREADID]], %[[C0]] : index
# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_10:.*]] = arith.constant 0 : index
# DUMPIR: %[[C1_11:.*]] = arith.constant 1 : index
# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_10]]], %[[C1_11]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_12]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
# DUMPIR: %[[VIEW_13:.*]] = memref.view %[[DSM1]][%[[C128]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_14:.*]] = arith.constant 0 : index
# DUMPIR: %[[C0_15:.*]] = arith.constant 0 : index
# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%[[C0_15]], %[[BLOCKID]]], %[[MB]][%[[C0_14]]] to %[[VIEW]], predicate = %[[EQ]] : <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_16:.*]] = arith.constant 0 : index
# DUMPIR: %[[C0_17:.*]] = arith.constant 0 : index
# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C0_17]], %[[BLOCKID]]], %[[MB]][%[[C0_16]]] to %[[VIEW_13]], predicate = %[[EQ]] : <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_18:.*]] = arith.constant 0 : index
# DUMPIR: %[[C256_19:.*]] = arith.constant 256 : index
# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%[[C0_18]]], %[[C256_19]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_20:.*]] = arith.constant 0 : index
# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
# DUMPIR: %[[FALSE:.*]] = arith.constant false
# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_20]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
# DUMPIR: %[[LD0:.*]] = memref.load %[[VIEW]][%[[C0_21]], %[[THREADID]]] : memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
# DUMPIR: %[[LD1:.*]] = memref.load %[[VIEW_13]][%[[C0_22]], %[[THREADID]]] : memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: %[[MUL:.*]] = arith.mulf %[[LD0]], %arg2 : f32
# DUMPIR: %[[ADD:.*]] = arith.addf %[[LD1]], %[[MUL]] : f32
# DUMPIR: memref.store %[[ADD]], %[[MEMREF0]][%[[BLOCKID]], %[[THREADID]]] : memref<256x32xf32>
# DUMPIR: gpu.terminator
# DUMPIR: }
# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%[[WAIT1]]] %arg1, %[[MEMREF0]] : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[WAIT2:.*]] = gpu.wait async [%[[MEMCPY3]]]
# DUMPIR: return
# DUMPIR: }
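Outside of lit, the two modes of any chapter reduce to the following manual invocations (illustrative: lit normally expands %PYTHON and %s, and the execution path additionally needs an SM90 GPU plus the CUDA runtime support library):

# GPU execution path, checked against the CHECK lines:
#   python Ch2.py | FileCheck Ch2.py
# IR-dump path, checked against the DUMPIR lines:
#   MLIR_NVDSL_PRINT_IR=1 python Ch2.py | FileCheck Ch2.py --check-prefix=DUMPIR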