tile-ai · LeiWang1999 · Jan 23, 2025 · Jan 20, 2025 · Jan 20, 2025 · Jan 20, 2025
diff --git a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py
@@ -257,7 +257,7 @@ def main(
             B_dequantize_local = T.alloc_local((warp_cols * local_size), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size), accum_dtype)
             reduced_accum_res = T.alloc_local(0, accum_dtype)
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
+            thread_binding = T.thread_binding(0, threads, "threadIdx.x")
             rk = T.thread_binding(0, reduce_k, "threadIdx.y")
 
             T.annotate_layout({
@@ -279,7 +279,7 @@ def main(
                 for i in T.serial(block_N * (block_K // reduce_k) // num_elems_per_byte //
                                   (threads * vec_load_qb)):
                     for v in T.vectorized(0, vec_load_qb):
-                        t = thread_bindings
+                        t = thread_binding
                         idx = i * threads * vec_load_qb * reduce_k + rk * threads * vec_load_qb + t * vec_load_qb + v
                         vkk = idx % (micro_size_k // num_elems_per_byte)
                         vjj = (idx // (micro_size_k // num_elems_per_byte)) % micro_size_y
@@ -299,7 +299,6 @@ def main(
                         A_local,
                         A_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                         rk=rk,
                     )
 
@@ -308,7 +307,6 @@ def main(
                         B_local,
                         B_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                         rk=rk,
                     )
 
@@ -343,7 +341,6 @@ def main(
                 mma_emitter.stmatrix(
                     C_local,
                     C_shared,
-                    thread_bindings=thread_bindings,
                 )
 
             for i, j in T.Parallel(block_M, (block_N // reduce_k)):

diff --git a/examples/gemm/README.md b/examples/gemm/README.md
@@ -339,7 +339,7 @@ def tl_matmul(
             B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)
 
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
+            thread_binding = T.thread_binding(0, threads, "threadIdx.x")
 
             T.annotate_layout({
                 A_shared: make_swizzle_layout(A_shared),
@@ -367,16 +367,14 @@ def tl_matmul(
                     mma_emitter.ldmatrix_a(
                         A_local,
                         A_shared,
-                        ki,
-                        thread_bindings=thread_bindings,
+                        ki
                     )
 
                     # Load B into fragment
                     mma_emitter.ldmatrix_b(
                         B_local,
                         B_shared,
-                        ki,
-                        thread_bindings=thread_bindings,
+                        ki
                     )
 
                     # Perform Matrix Multiplication
@@ -386,7 +384,6 @@ def tl_matmul(
             mma_emitter.stmatrix(
                 C_local,
                 C_shared,
-                thread_bindings=thread_bindings,
             )
 
             # Store shared into global
@@ -416,10 +413,10 @@ def tl_matmul(
    ```python
    for ki in T.serial(0, (block_K // micro_size_k)):
        # Warp-synchronous load for A
-       mma_emitter.ldmatrix_a(A_local, A_shared, ki, thread_bindings=thread_bindings)
+       mma_emitter.ldmatrix_a(A_local, A_shared, ki)
 
        # Warp-synchronous load for B
-       mma_emitter.ldmatrix_b(B_local, B_shared, ki, thread_bindings=thread_bindings)
+       mma_emitter.ldmatrix_b(B_local, B_shared, ki)
    ```
    Internally, these calls orchestrate how each thread in the warp issues the correct load instructions, performs address calculations, and stores the data into registers.
 
@@ -437,7 +434,7 @@ def tl_matmul(
 5. **Store Results via `stmatrix`**  
    Finally, you write the results from the warp-level fragments back to shared memory or global memory. This step might happen multiple times in a loop or just once at the end. The code snippet:
    ```python
-   mma_emitter.stmatrix(C_local, C_shared, thread_bindings=thread_bindings)
+   mma_emitter.stmatrix(C_local, C_shared)
    ```
    orchestrates the warp-synchronous stores, ensuring each thread places the correct fragment element into the correct location of the shared or global buffer.
 

diff --git a/examples/gemm/example_gemm_intrinsics.py b/examples/gemm/example_gemm_intrinsics.py
@@ -116,8 +116,6 @@ def main(
             B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)
 
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
-
             T.annotate_layout({
                 A_shared: make_swizzle_layout(A_shared),
                 B_shared: make_swizzle_layout(B_shared),
@@ -141,30 +139,16 @@ def main(
                 for ki in T.serial(0, (block_K // micro_size_k)):
 
                     # Load A into fragment
-                    mma_emitter.ldmatrix_a(
-                        A_local,
-                        A_shared,
-                        ki,
-                        thread_bindings=thread_bindings,
-                    )
+                    mma_emitter.ldmatrix_a(A_local, A_shared, ki)
 
                     # Load B into fragment
-                    mma_emitter.ldmatrix_b(
-                        B_local,
-                        B_shared,
-                        ki,
-                        thread_bindings=thread_bindings,
-                    )
+                    mma_emitter.ldmatrix_b(B_local, B_shared, ki)
 
                     # Perform Matrix Multiplication
                     mma_emitter.mma(A_local, B_local, C_local)
 
             # Perform STMatrix
-            mma_emitter.stmatrix(
-                C_local,
-                C_shared,
-                thread_bindings=thread_bindings,
-            )
+            mma_emitter.stmatrix(C_local, C_shared)
 
             # Store shared into global
             for i, j in T.Parallel(block_M, block_N):

diff --git a/testing/python/amd/test_tilelang_gemm_mfma_intrinsic.py b/testing/python/amd/test_tilelang_gemm_mfma_intrinsic.py
@@ -99,8 +99,6 @@ def main(
             B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)
 
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
-
             T.annotate_layout({
                 A_shared: make_swizzle_layout(A_shared),
                 B_shared: make_swizzle_layout(B_shared),
@@ -128,15 +126,13 @@ def main(
                         A_local,
                         A_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                     )
 
                     # Load B into fragment
                     mfma_emitter.ldmatrix_b(
                         B_local,
                         B_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                     )
 
                     # Perform Matrix Multiplication
@@ -147,7 +143,6 @@ def main(
                 mfma_emitter.stmatrix(
                     C_local,
                     C_shared,
-                    thread_bindings=thread_bindings,
                 )
 
                 # Store shared into global
@@ -162,7 +157,6 @@ def main(
                 mfma_emitter.stmatrix(
                     C_local,
                     C,
-                    thread_bindings=thread_bindings,
                     pid_m=by,
                     pid_n=bx,
                 )

diff --git a/testing/python/dynamic/test_tilelang_dynamic_symbolic.py b/testing/python/dynamic/test_tilelang_dynamic_symbolic.py
@@ -113,8 +113,6 @@ def main(
             B_local = T.alloc_local((warp_cols * local_size), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size), accum_dtype)
 
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
-
             T.annotate_layout({
                 A_shared: make_swizzle_layout(A_shared),
                 B_shared: make_swizzle_layout(B_shared),
@@ -142,15 +140,13 @@ def main(
                         A_local,
                         A_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                     )
 
                     # Load B into fragment
                     mma_emitter.ldmatrix_b(
                         B_local,
                         B_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                     )
 
                     # Perform Matrix Multiplication
@@ -160,7 +156,6 @@ def main(
             mma_emitter.stmatrix(
                 C_local,
                 C_shared,
-                thread_bindings=thread_bindings,
             )
 
             # Store shared into global

diff --git a/...n/kernel/test_tilelang_dequantize_gemm.py → ...l/test_tilelang_kernel_dequantize_gemm.py b/...n/kernel/test_tilelang_dequantize_gemm.py → ...l/test_tilelang_kernel_dequantize_gemm.py
@@ -457,7 +457,7 @@ def main(
             B_dequantize_local = T.alloc_local((warp_cols * local_size), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size), accum_dtype)
             reduced_accum_res = T.alloc_local(0, accum_dtype)
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
+            thread_binding = T.thread_binding(0, threads, "threadIdx.x")
             rk = T.thread_binding(0, reduce_k, "threadIdx.y")
 
             T.annotate_layout({
@@ -479,7 +479,7 @@ def main(
                 for i in T.serial(block_N * (block_K // reduce_k) // num_elems_per_byte //
                                   (threads * vec_load_qb)):
                     for v in T.vectorized(0, vec_load_qb):
-                        t = thread_bindings
+                        t = thread_binding
                         idx = i * threads * vec_load_qb * reduce_k + rk * threads * vec_load_qb + t * vec_load_qb + v
                         vkk = idx % (micro_size_k // num_elems_per_byte)
                         vjj = (idx // (micro_size_k // num_elems_per_byte)) % micro_size_y
@@ -499,7 +499,6 @@ def main(
                         A_local,
                         A_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                         rk=rk,
                     )
 
@@ -508,7 +507,6 @@ def main(
                         B_local,
                         B_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                         rk=rk,
                     )
 
@@ -543,7 +541,6 @@ def main(
                 mma_emitter.stmatrix(
                     C_local,
                     C_shared,
-                    thread_bindings=thread_bindings,
                 )
 
             for i, j in T.Parallel(block_M, (block_N // reduce_k)):

diff --git a/testing/python/kernel/test_tilelang_gemm.py → ...ython/kernel/test_tilelang_kernel_gemm.py b/testing/python/kernel/test_tilelang_gemm.py → ...ython/kernel/test_tilelang_kernel_gemm.py
diff --git a/...ernel/test_tilelang_gemm_mma_intrinsic.py → ...est_tilelang_kernel_gemm_mma_intrinsic.py b/...ernel/test_tilelang_gemm_mma_intrinsic.py → ...est_tilelang_kernel_gemm_mma_intrinsic.py
@@ -119,7 +119,7 @@ def main(
             B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)
 
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
+            thread_binding = T.thread_binding(0, threads, "threadIdx.x")
 
             T.annotate_layout({
                 A_shared: make_swizzle_layout(A_shared),
@@ -148,15 +148,13 @@ def main(
                         A_local,
                         A_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                     )
 
                     # Load B into fragment
                     mma_emitter.ldmatrix_b(
                         B_local,
                         B_shared,
                         ki,
-                        thread_bindings=thread_bindings,
                     )
 
                     # Perform Matrix Multiplication
@@ -166,7 +164,6 @@ def main(
             mma_emitter.stmatrix(
                 C_local,
                 C_shared,
-                thread_bindings=thread_bindings,
             )
 
             # Store shared into global

diff --git a/.../python/kernel/test_tilelang_gemm_simt.py → .../kernel/test_tilelang_kernel_gemm_simt.py b/.../python/kernel/test_tilelang_gemm_simt.py → .../kernel/test_tilelang_kernel_gemm_simt.py
diff --git a/...n/kernel/test_tilelang_int4_mma_matmul.py → ...l/test_tilelang_kernel_int4_mma_matmul.py b/...n/kernel/test_tilelang_int4_mma_matmul.py → ...l/test_tilelang_kernel_int4_mma_matmul.py
@@ -109,7 +109,7 @@ def main(
             B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)
 
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
+            thread_binding = T.thread_binding(0, threads, "threadIdx.x")
 
             T.annotate_layout({
                 A_shared: make_swizzle_layout(A_shared),
@@ -138,16 +138,14 @@ def main(
                         A_local,
                         A_shared,
                         ki,
-                        thread_bindings=thread_bindings,
-                    )
+                        )
 
                     # Load B into fragment
                     mma_emitter.ldmatrix_b(
                         B_local,
                         B_shared,
                         ki,
-                        thread_bindings=thread_bindings,
-                    )
+                        )
 
                     # Perform Matrix Multiplication
                     mma_emitter.mma(A_local, B_local, C_local)
@@ -156,7 +154,6 @@ def main(
             mma_emitter.stmatrix(
                 C_local,
                 C_shared,
-                thread_bindings=thread_bindings,
             )
 
             # Store shared into global
@@ -297,7 +294,7 @@ def main(
             B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
             C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)
 
-            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")
+            thread_binding = T.thread_binding(0, threads, "threadIdx.x")
 
             T.annotate_layout({
                 A_shared: make_swizzle_layout(A_shared),
@@ -328,16 +325,14 @@ def main(
                         A_local,
                         A_shared,
                         ki,
-                        thread_bindings=thread_bindings,
-                    )
+                        )
 
                     # Load B into fragment
                     mma_emitter.ldmatrix_b(
                         B_local,
                         B_shared,
                         ki,
-                        thread_bindings=thread_bindings,
-                    )
+                        )
 
                     # Perform Matrix Multiplication
                     mma_emitter.mma(A_local, B_local, C_local)
@@ -346,7 +341,6 @@ def main(
             mma_emitter.stmatrix(
                 C_local,
                 C_shared,
-                thread_bindings=thread_bindings,
             )
 
             # Store shared into global