Commit e9a608e
xwhzz and LeiWang1999 authored
[Bugfix][CI] Bug fixing and migrate CI from ada to hopper (#652)
* fix CI bugs in hopper
* lint fix
* Update bulk_copy.cc
* Refactor bulk copy logic in LowerBulkCopy function
  - Removed unnecessary blank lines for improved code readability.
  - Enhanced stride validation by checking for null pointers in global stride calculations, ensuring robustness against symbolic strides.
  - Updated pass configuration handling in dynamic tile language tests to streamline dynamic alignment and TMA lower pass settings.
* test fix
* ci fix
* Update flash-attention dependencies and clean up example code
  - Downgraded `flash-attn` dependency version in `requirements-test.txt` to `<=2.2.0`.
  - Removed unused imports and commented-out code in various example files to enhance readability and maintainability.
  - Updated the `flashattn` function signature to include default parameters for `block_M`, `block_N`, `num_stages`, and `threads`.
  - Cleaned up the `example_mha_fwd_varlen.py` and `example_mha_bwd_wgmma_pipelined.py` files by removing unnecessary comments and improving code clarity.
  - Deleted the `example_mha_inference.py` file as it is no longer needed.
* Update CI workflow to remove `--user` flag from pip install commands
  - Removed the `--user` flag from the pip install commands in both the development and testing sections of the CI workflow to ensure proper installation of dependencies in the virtual environment.
* Update CI workflow to include `--no-user` flag in pip install commands
  - Added the `--no-user` flag to the pip install commands in both the development and testing sections of the CI workflow to ensure dependencies are installed correctly within the virtual environment.
* Update CI workflow to include `--no-user` flag in pip install command for wheel mode
  - Added the `--no-user` flag to the pip install command in the wheel mode section of the CI workflow to ensure dependencies are installed correctly within the virtual environment.
* test fix
* avoid conflict with system environments
* test fix
* add comments

---------

Co-authored-by: Lei Wang <34334180+LeiWang1999@users.noreply.github.com>
Co-authored-by: LeiWang1999 <leiwang1999@outlook.com>
1 parent 5bd3f94 commit e9a608e


41 files changed (+419, -1740 lines)
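
Among the example clean-ups, the commit message notes that `flashattn` now takes default values for its tuning parameters. A minimal sketch of that kind of defaulted signature is shown below; the parameter names come from the commit message, while the default values and the stub body are illustrative placeholders rather than the actual kernel.

```python
# Sketch only: the default values here are placeholders, not the repository's.
def flashattn(batch, heads, seq_len, dim,
              block_M=64, block_N=64, num_stages=2, threads=128):
    # With defaults on the tuning knobs, callers (and CI smoke tests) can
    # invoke flashattn(batch, heads, seq_len, dim) without repeating block
    # sizes, pipeline stages, and thread counts at every call site.
    ...
```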

.github/workflows/ci.yml

Lines changed: 7 additions & 5 deletions

@@ -23,8 +23,8 @@ jobs:
       - name: Activate virtual environment and install dependencies
         run: |
           source tilelang_ci/bin/activate
-          python -m pip install --upgrade pip
-          if [ -f requirements-dev.txt ]; then python -m pip install -r requirements-dev.txt; fi
+          python -m pip install --upgrade pip --no-user
+          if [ -f requirements-dev.txt ]; then python -m pip install -r requirements-dev.txt --no-user; fi

       - name: Update submodules recursively
         run: git submodule update --init --recursive
@@ -55,22 +55,24 @@ jobs:
       - name: Activate virtual environment and install dependencies
         run: |
           source tilelang_ci/bin/activate
-          python -m pip install --upgrade pip
-          if [ -f requirements-test.txt ]; then PIP_NO_BUILD_ISOLATION=1 python -m pip install -r requirements-test.txt; fi
+          python -m pip install --upgrade pip --no-user
+          if [ -f requirements-test.txt ]; then PIP_NO_BUILD_ISOLATION=1 python -m pip install -r requirements-test.txt --no-user; fi

       - name: Install project in wheel mode
         run: |
           source tilelang_ci/bin/activate
-          python -m pip install .
+          python -m pip install . --no-user

       - name: Run examples
         run: |
           source tilelang_ci/bin/activate
           cd examples
+          unset PYTHONPATH
           python -m pytest **/test*.py

       - name: Run tests
         run: |
           source tilelang_ci/bin/activate
           cd testing/python
+          unset PYTHONPATH
           python -m pytest

3rdparty/tvm

Submodule tvm updated from db50d4e to 979c8e7

examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py

Lines changed: 2 additions & 16 deletions

@@ -1,7 +1,6 @@
 import torch
 import torch.nn.functional as F
 import tilelang
-from tilelang.autotuner import *
 import tilelang.language as T
 from einops import rearrange, einsum
 import argparse
@@ -71,7 +70,7 @@ def flash_attn_split(
            loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0))
            start = blocks_per_split * sid + T.min(sid, remaining_blocks)
            has_valid_block = False
-           # if (start < num_blocks):
+
            for k in T.Pipelined(loop_range, num_stages=num_stages):
                i_s = block_indices[bid, cur_kv_head, start + k]
                if i_s >= 0:
@@ -238,23 +237,12 @@ def forward(self, query, key, value, block_indices, cache_seqlens):
            size_one_kv_head,
            is_causal_or_local=True,
            max_splits=128)
-        # print("num_split: ", num_split)
-        # Function to compile
-        # def compute_actual_num_blocks(block_indices):
-        #     actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32)
-        #     actual_num_blocks = actual_num_blocks[:, 0] # [batch]
-        #     return actual_num_blocks
-        # compiled_fn = torch.compile(compute_actual_num_blocks)
-        # actual_num_blocks = compiled_fn(block_indices)
+
        glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda')
        output_partial = torch.empty((batch, heads, num_split, dim_v),
                                     dtype=torch.float32,
                                     device='cuda')

-        # output = self.kernel(
-        #     query, key, value, block_indices, cache_seqlens,
-        #     actual_num_blocks, glse, output_partial
-        # )
        output = self.kernel(query, key, value, block_indices, cache_seqlens, glse, output_partial)
        return output

@@ -377,8 +365,6 @@ def debug(name, expect, actual, atol=1e-3, rtol=1e-3):
    all_close = torch.allclose(expect, actual, atol=atol, rtol=rtol)
    print(name + " all_close={}".format(all_close))
    if not all_close:
-        # print(expect[3, 28])
-        # print(actual[3, 28])
        diff = (expect - actual).abs()
        print("all_close={}, max={}, min={}, mean={}".format(all_close,
                                                             diff.max().item(),

examples/convolution/example_convolution.py

Lines changed: 2 additions & 3 deletions

@@ -116,9 +116,8 @@ def main(argv=None):
    block_k = 32
    num_stages = 3
    threads = 256
-
-    kernel = tilelang.compile(
-        convolution(N, C, H, W, F, K, S, D, P, block_m, block_n, block_k, num_stages, threads), out_idx=[2])
+    program = convolution(N, C, H, W, F, K, S, D, P, block_m, block_n, block_k, num_stages, threads)
+    kernel = tilelang.compile(program, out_idx=[2])

    out_c = kernel(a, b)
    ref_c = ref_program(S, P, D)(a, b)
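
The change above only splits program construction from compilation, but it makes the intent easier to read: build the TileLang program, compile it once, then call the compiled kernel. A fragment re-stating the new call pattern follows, assuming `out_idx` keeps its usual tilelang meaning; `convolution`, the problem sizes, and the tensors `a` and `b` are defined earlier in `example_convolution.py` and are only referenced here, not re-defined.

```python
# Two-step pattern from the diff above (names come from the example file).
program = convolution(N, C, H, W, F, K, S, D, P,
                      block_m, block_n, block_k, num_stages, threads)
kernel = tilelang.compile(program, out_idx=[2])  # buffer 2 (the output) is
                                                 # allocated and returned

out_c = kernel(a, b)  # pass the two inputs; the output tensor comes back
```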

examples/convolution/test_example_convolution.py

Lines changed: 5 additions & 0 deletions

@@ -4,10 +4,15 @@
 import example_convolution_autotune


+# TODO(@cy): TMA with convolution must be fixed in future.
+@tilelang.testing.requires_cuda
+@tilelang.testing.requires_cuda_compute_version_le(8, 9)
 def test_example_convolution():
     example_convolution.main([])


+@tilelang.testing.requires_cuda
+@tilelang.testing.requires_cuda_compute_version_le(8, 9)
 def test_example_convolution_autotune():
     example_convolution_autotune.main()
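
The decorators added above gate the convolution tests by backend and compute capability. Judging by their names (the semantics are inferred here, not verified), `requires_cuda` skips the test when no CUDA device is available, and `requires_cuda_compute_version_le(8, 9)` skips it on GPUs newer than compute capability 8.9, i.e. on the Hopper runners this commit migrates CI to, where the TMA convolution path is still pending a fix. The same guard pattern, spelled out with comments:

```python
import tilelang
import tilelang.testing

import example_convolution  # example module exercised by the test


# Run only when a CUDA device is present and its compute capability is <= 8.9
# (Ada and older); skip on Hopper (9.0) until the TMA convolution fix lands.
@tilelang.testing.requires_cuda
@tilelang.testing.requires_cuda_compute_version_le(8, 9)
def test_example_convolution():
    example_convolution.main([])
```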

examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
 tilelang.testing.set_random_seed(42)


-@tilelang.jit(out_idx=[2])
+@tilelang.jit
 def tl_gemm(
     M,
     N,
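
For context on the one-line change above: with `out_idx`, the `tilelang.jit` wrapper allocates the listed buffers at call time and returns them, so the caller passes only the inputs; with the bare decorator, the caller passes every buffer, including the output, which is how the FP8 example now drives `tl_gemm`. A minimal sketch in the style of the tilelang quickstart GEMM (block sizes, dtypes, and the kernel body are illustrative, not taken from this file):

```python
import torch
import tilelang
import tilelang.language as T


@tilelang.jit(out_idx=[2])  # buffer 2 (C) is allocated by the wrapper and returned
def matmul(M, N, K, block_M=128, block_N=128, block_K=32,
           dtype="float16", accum_dtype="float"):

    @T.prim_func
    def main(
            A: T.Tensor((M, K), dtype),
            B: T.Tensor((K, N), dtype),
            C: T.Tensor((M, N), dtype),
    ):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), dtype)
            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
            T.clear(C_local)
            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
                T.copy(A[by * block_M, ko * block_K], A_shared)
                T.copy(B[ko * block_K, bx * block_N], B_shared)
                T.gemm(A_shared, B_shared, C_local)
            T.copy(C_local, C[by * block_M, bx * block_N])

    return main


a = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
b = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)

# With out_idx=[2]: pass the inputs only and get the output back.
c = matmul(1024, 1024, 1024)(a, b)

# With a bare @tilelang.jit (as in the diff above), the caller would instead
# allocate c itself and pass it as the third argument.
```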

examples/flash_attention/example_mha_bwd_wgmma_pipelined.py

Lines changed: 3 additions & 6 deletions

@@ -23,7 +23,6 @@ def flash_fwd(
 ):
     with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz):
         Q_shared = T.alloc_shared([block_M, dim], dtype)
-        # Q_local = T.alloc_fragment([block_M, dim], dtype)
         K_shared = T.alloc_shared([block_N, dim], dtype)
         V_shared = T.alloc_shared([block_N, dim], dtype)
         acc_s = T.alloc_fragment([block_M, block_N], accum_dtype)
@@ -40,9 +39,7 @@ def flash_fwd(
         T.fill(acc_o, 0)
         T.fill(logsum, 0)
         T.fill(scores_max, -T.infinity(accum_dtype))
-        # T.copy(Q_shared, Q_local)
-        # for i, j in T.Parallel(block_M, dim):
-        #     Q_local[i, j] *= scale
+
         loop_range = (
             T.ceildiv(
                 (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N))
@@ -264,8 +261,8 @@ def maybe_contiguous(x):
         return x

     do, q, k, v, o = [maybe_contiguous(x) for x in (do, q, k, v, o)]
-    block_M = 64
-    block_N = 64 if D_HEAD <= 64 else 32
+    block_M = 128
+    block_N = 128 if D_HEAD <= 64 else 32
     mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD)
     mod_post = flashattn_bwd_postprocess(BATCH, H, N_CTX, D_HEAD)
     delta = mod_prep(o, do)
