
Commit f2f8a28

minor fix
1 parent 65448b5 commit f2f8a28

8 files changed (+12 -37 lines)


examples/blocksparse_gemm/example_blocksparse_gemm.py

Lines changed: 0 additions & 1 deletion
@@ -166,7 +166,6 @@ def main():
             enable_rasteration=DEFAULT_ENABLE_RASTERIZATION)
         block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K
         print(f"Using default kernel with block size ({block_M}, {block_N}, {block_K})")
-        print(kernel.get_kernel_source())
     # Create block mask with desired sparsity
     mask_shape = (M // block_M, N // block_N, K // block_K)
     block_mask = torch.rand(mask_shape).cuda() > sparsity
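The deleted line dumped the full generated CUDA source on every run of the example. If the source is still wanted occasionally, it can be fetched on demand from the compiled kernel object; a minimal sketch, assuming `kernel` is the object compiled earlier in main() and using a hypothetical TL_DUMP_SOURCE environment flag:

    import os

    # Only print the generated CUDA source when explicitly requested,
    # instead of on every invocation of the example.
    if os.environ.get("TL_DUMP_SOURCE"):
        print(kernel.get_kernel_source())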
Lines changed: 2 additions & 2 deletions
@@ -1,3 +1,4 @@
+import tilelang.testing
 import example_blocksparse_gemm


@@ -6,5 +7,4 @@ def test_example_blocksparse_gemm():


 if __name__ == "__main__":
-    # tilelang.testing.main()
-    test_example_blocksparse_gemm()
+    tilelang.testing.main()
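Switching the __main__ block to tilelang.testing.main() runs every test_* function in the module rather than a single hand-picked one. A sketch of the resulting test-file shape; the body of the test is not shown in this diff, so the call into the example's main() below is an assumption for illustration:

    import tilelang.testing

    import example_blocksparse_gemm


    def test_example_blocksparse_gemm():
        # Run the example end to end; the real test body is not part of this diff.
        example_blocksparse_gemm.main()


    if __name__ == "__main__":
        # Discover and run all test_* functions in this module.
        tilelang.testing.main()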

examples/gdn/test_example_gdn_compilation.py

Lines changed: 1 addition & 10 deletions
@@ -107,7 +107,6 @@ def test_example_chunk_o_compilation():


 def test_example_chunk_o_bwd_compilation():
-    tilelang.disable_cache()
     from example_chunk_o_bwd import tilelang_chunk_o_bwd_dqkwg, prepare_input
     Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size,
                                                  getattr(torch, input_dtype),
@@ -118,13 +117,6 @@ def test_example_chunk_o_bwd_compilation():
     kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype,
                                         gate_dtype, state_dtype, chunk_size, 1.0, use_g, True,
                                         block_DK, block_DV, threads, num_stages)
-    # print(kernel.get_kernel_source())
-    kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype,
-                                        gate_dtype, state_dtype, chunk_size, 1.0, use_g, True,
-                                        block_DK, block_DV, threads, num_stages)
-    kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype,
-                                        gate_dtype, state_dtype, chunk_size, 1.0, use_g, True,
-                                        block_DK, block_DV, threads, num_stages)

     dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv,
                                                                 W)  # noqa: F841
@@ -197,5 +189,4 @@ def test_example_chunk_delta_bwd_compilation():


 if __name__ == "__main__":
-    # tilelang.testing.main()
-    test_example_chunk_o_bwd_compilation()
+    tilelang.testing.main()
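tilelang.disable_cache() is a process-wide switch, so leaving it inside a single test also slows every compilation that runs after it; the same applies to compiling the identical kernel three times in a row. If cache-free compilation is ever needed again for debugging, a hedged sketch of keeping it local and opt-in (the TL_NO_CACHE environment flag is hypothetical):

    import os
    import tilelang

    # Opt into cache-free compilation only when chasing a stale-cache problem;
    # the default (cache enabled) is what the tests above now rely on.
    if os.environ.get("TL_NO_CACHE"):
        tilelang.disable_cache()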

examples/quickstart.py

Lines changed: 6 additions & 12 deletions
@@ -5,7 +5,10 @@
 # @tilelang.jit(target="cuda")
 # target currently can be "cuda" or "hip" or "cpu".
 # if not specified, it will be inferred from the input tensors during compile time
-@tilelang.jit
+@tilelang.jit(execution_backend="tvm_ffi", pass_configs={
+    tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER:True,
+    tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+})
 def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):

     @T.prim_func
@@ -48,7 +51,7 @@ def matmul_relu_kernel(
     return matmul_relu_kernel


-M = 1024  # M = T.dynamic("m") if you want to use dynamic shape
+M = T.dynamic("m")  # M = T.dynamic("m") if you want to use dynamic shape
 N = 1024
 K = 1024
 block_M = 128
@@ -61,6 +64,7 @@ def matmul_relu_kernel(
 # 3. Test the kernel in Python with PyTorch data
 import torch

+M = 0
 # Create random input tensors on the GPU
 a = torch.randn(M, K, device="cuda", dtype=torch.float16)
 b = torch.randn(K, N, device="cuda", dtype=torch.float16)
@@ -77,13 +81,3 @@ def matmul_relu_kernel(
 torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2)
 print("Kernel output matches PyTorch reference.")

-# 4. Retrieve and inspect the generated CUDA source (optional)
-# cuda_source = jit_kernel.get_kernel_source()
-# print("Generated CUDA kernel:\n", cuda_source)
-
-# 5.Profile latency with kernel
-profiler = matmul_relu_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
-
-latency = profiler.do_bench()
-
-print(f"Latency: {latency} ms")

testing/python/debug/test_tilelang_debug_print.py

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@ def program(Q: T.Tensor((M, N), dtype)):
             shared_buf = T.alloc_shared([M, N], dtype)
             T.print(shared_buf)

-    tilelang.disable_cache()
     jit_kernel = tilelang.compile(program, target="cuda", execution_backend="tvm_ffi")
     profiler = jit_kernel.get_profiler()
     profiler.run_once()
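Only a fragment of the kernel appears in this hunk. A self-contained sketch of a debug-print kernel in the same spirit is shown below; the single-block launch configuration and sizes are assumptions, only the alloc_shared/T.print lines and the compile call come from the diff:

    import tilelang
    import tilelang.language as T

    M, N, dtype = 64, 64, "float16"

    @T.prim_func
    def program(Q: T.Tensor((M, N), dtype)):
        # One block is enough to demonstrate printing a shared buffer;
        # Q is unused here, mirroring the fragment above.
        with T.Kernel(1, threads=128) as bx:
            shared_buf = T.alloc_shared([M, N], dtype)
            T.print(shared_buf)

    jit_kernel = tilelang.compile(program, target="cuda", execution_backend="tvm_ffi")
    jit_kernel.get_profiler().run_once()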

testing/python/jit/test_tilelang_jit_nullptr.py

Lines changed: 1 addition & 4 deletions
@@ -83,14 +83,12 @@ def main(


 def run_test(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
-    tilelang.disable_cache()
     kernel = ptr_null_test(M, N, K, block_M, block_N, block_K, dtype, accum_dtype)

     a = torch.randn(M, K, device="cuda", dtype=map_torch_type(dtype))
     b = torch.randn(N, K, device="cuda", dtype=map_torch_type(dtype))
     c = torch.zeros(M, N, device="cuda", dtype=map_torch_type(accum_dtype))
     d = torch.randn(N, device="cuda", dtype=map_torch_type(accum_dtype))
-    print(kernel.get_host_source())
     kernel(a, b, c, None, M, N, K, False)

     ref_no_bias = (a @ b.T).to(map_torch_type(accum_dtype))
@@ -114,5 +112,4 @@ def test_nullptr():


 if __name__ == "__main__":
-    # tilelang.testing.main()
-    run_test(1024, 1024, 1024, 128, 128, 32)
+    tilelang.testing.main()
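The call kept above, kernel(a, b, c, None, M, N, K, False), passes None where the bias tensor d would go, so the kernel receives a null pointer and the reference is computed without bias. A hedged sketch of the two call shapes this test is built around; the with-bias call is an assumption inferred from the d tensor and the ref_no_bias name, it is not shown in this hunk:

    # With bias: d is a real tensor and the flag asks the kernel to apply it (assumed call).
    kernel(a, b, c, d, M, N, K, True)
    # Without bias: None becomes a null pointer inside the kernel (call shown in the diff).
    kernel(a, b, c, None, M, N, K, False)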

testing/python/jit/test_tilelang_jit_nvrtc.py

Lines changed: 2 additions & 6 deletions
@@ -364,8 +364,7 @@ def run_nvrtc_dynamic_shape(M,
         num_threads,
     )

-    matmul_kernel = tilelang.compile(program, execution_backend="tvm_ffi")
-    print(matmul_kernel.get_host_source())
+    matmul_kernel = tilelang.compile(program, execution_backend="nvrtc")
     if isinstance(M, T.Var):
         M = 1024
     if isinstance(N, T.Var):
@@ -583,7 +582,4 @@ def kernel(


 if __name__ == "__main__":
-    # tilelang.testing.main()
-    tilelang.disable_cache()
-    run_nvrtc_dynamic_shape(
-        T.dynamic("m"), 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2)
+    tilelang.testing.main()
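The fix restores the nvrtc execution backend that this file is meant to exercise; the tvm_ffi backend remains in use elsewhere (for example in test_tilelang_debug_print.py above). Both compile forms, exactly as they appear in this commit, assuming program is any @T.prim_func such as the matmul built earlier in this file:

    # NVRTC backend, as restored in run_nvrtc_dynamic_shape:
    matmul_kernel = tilelang.compile(program, execution_backend="nvrtc")

    # TVM FFI backend, as used by the debug-print test:
    jit_kernel = tilelang.compile(program, target="cuda", execution_backend="tvm_ffi")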

testing/python/language/test_tilelang_language_alloc.py

Lines changed: 0 additions & 1 deletion
@@ -113,7 +113,6 @@ def run_alloc_var_with_initializer(

     kernel = tilelang.compile(program, out_idx=[1])
     code = kernel.get_kernel_source()
-    print(code)
     assert f"= {init_value};" in code


