Merged
29 commits
6745eac
[Refactor] Update FFI type handling and simplify argument management
LeiWang1999 Nov 14, 2025
b09d6d7
[Update] Sync TVM submodule and enhance kernel source handling
LeiWang1999 Nov 14, 2025
dacab10
[Refactor] Clean up imports and improve code formatting
LeiWang1999 Nov 14, 2025
ca91aa8
Merge branch 'main' of https://github.com/tile-ai/tilelang into ffi_1114
LeiWang1999 Nov 15, 2025
c6812f1
Update execution backend options and improve resolution logic
LeiWang1999 Nov 15, 2025
d09b267
lint fix
LeiWang1999 Nov 15, 2025
4a6e99b
fix
LeiWang1999 Nov 16, 2025
1ee9132
Enhance argument handling in CUDA and HIP runtime modules
LeiWang1999 Nov 16, 2025
4364065
lint fix
LeiWang1999 Nov 16, 2025
a555165
lint fix
LeiWang1999 Nov 16, 2025
2f0bf8d
lint fix
LeiWang1999 Nov 16, 2025
8f59fa1
lint fix
LeiWang1999 Nov 16, 2025
0df9ba4
minor fix
LeiWang1999 Nov 17, 2025
e202d38
fix
LeiWang1999 Nov 17, 2025
72fff54
recover check
LeiWang1999 Nov 17, 2025
3b30b19
Refactor argument binding and validation in `arg_binder.cc`
LeiWang1999 Nov 17, 2025
f85bb8f
lint fix
LeiWang1999 Nov 17, 2025
65448b5
stride fix
LeiWang1999 Nov 17, 2025
f2f8a28
minor fix
LeiWang1999 Nov 18, 2025
0794c29
fix
LeiWang1999 Nov 18, 2025
2b3bd54
lint fix
LeiWang1999 Nov 18, 2025
7fdfa12
lint fix
LeiWang1999 Nov 18, 2025
f177025
Add CUDA stream access policy window helpers and integrate with L2 pe…
LeiWang1999 Nov 18, 2025
0694198
check with symbolic
LeiWang1999 Nov 18, 2025
f13aecd
support null ptr
LeiWang1999 Nov 18, 2025
7165251
Update CMakeLists and lower.py for code generation and subproject status
LeiWang1999 Nov 18, 2025
2e6f110
lint fix
LeiWang1999 Nov 18, 2025
8d748ba
Merge branch 'main' of https://github.com/tile-ai/tilelang into ffi_1114
LeiWang1999 Nov 18, 2025
7c0b8cd
Update comments for clarity in quickstart.py
LeiWang1999 Nov 18, 2025
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 093b2c to f4105f
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -138,6 +138,7 @@ file(GLOB TILE_LANG_SRCS
src/transform/*.cc
src/op/*.cc
src/target/utils.cc
src/target/codegen_c_host.cc
src/target/codegen_cpp.cc
src/target/rt_mod_cpp.cc
# intrin_rule doesn't have system dependency
1 change: 0 additions & 1 deletion examples/blocksparse_gemm/example_blocksparse_gemm.py
@@ -166,7 +166,6 @@ def main():
enable_rasteration=DEFAULT_ENABLE_RASTERIZATION)
block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K
print(f"Using default kernel with block size ({block_M}, {block_N}, {block_K})")

# Create block mask with desired sparsity
mask_shape = (M // block_M, N // block_N, K // block_K)
block_mask = torch.rand(mask_shape).cuda() > sparsity
1 change: 0 additions & 1 deletion examples/gdn/example_chunk_o_bwd.py
@@ -468,7 +468,6 @@ def run_test(
kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype,
gate_dtype, state_dtype, chunk_size, scale, use_g, use_dw,
block_DK, block_DV, threads, num_stages)
print(kernel.get_kernel_source())
dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W)

if use_g:
1 change: 1 addition & 0 deletions examples/gdn/test_example_gdn_compilation.py
@@ -117,6 +117,7 @@ def test_example_chunk_o_bwd_compilation():
kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype,
gate_dtype, state_dtype, chunk_size, 1.0, use_g, True,
block_DK, block_DV, threads, num_stages)

dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv,
W) # noqa: F841
if use_g:
5 changes: 2 additions & 3 deletions examples/quickstart.py
@@ -55,10 +55,9 @@ def matmul_relu_kernel(
block_N = 128
block_K = 32

# 1. Define the kernel (matmul) and compile/lower it into an executable module
# Define the kernel (matmul) and compile/lower it into an executable module
matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K)

# 3. Test the kernel in Python with PyTorch data
# Test the kernel in Python with PyTorch data
import torch

# Create random input tensors on the GPU
1 change: 1 addition & 0 deletions pyproject.toml
@@ -104,6 +104,7 @@ tilelang = "tilelang"
# TVM
"tilelang/3rdparty/tvm/src" = "3rdparty/tvm/src"
"tilelang/3rdparty/tvm/python" = "3rdparty/tvm/python"
"tilelang/3rdparty/tvm/include" = "3rdparty/tvm/include"
"tilelang/3rdparty/tvm/version.py" = "3rdparty/tvm/version.py"
# CUTLASS
"tilelang/3rdparty/cutlass/include" = "3rdparty/cutlass/include"
172 changes: 158 additions & 14 deletions src/runtime/runtime.cc
@@ -13,6 +13,12 @@
namespace tvm {
namespace tl {

#if 1
// Thread-local storage for restoring the L2 persisting cache limit
static thread_local size_t __tl_prev_persisting_l2_cache_size = 0;
static thread_local bool __tl_prev_persisting_l2_cache_saved = false;
#endif

#if (CUDA_MAJOR_VERSION >= 12)
template <typename T> static std::string ArrayToStr(const T *ptr, size_t n) {
std::stringstream ss;
@@ -91,19 +97,21 @@ struct TensorMapArgs {
// set device api
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef().def_packed("tvm_tensormap_create_tiled", [](PackedArgs args,
Any *ret) {
TensorMapArgs T = TensorMapArgs::Extract(args);
CUresult result = cuTensorMapEncodeTiled(
T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim,
T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave, T.swizzle,
T.l2Promotion, T.oobFill);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to initialize the TMA descriptor " << result << '\n'
<< T.ToDebugString();
}
*ret = static_cast<int>(result);
});
// Register using the canonical names defined in runtime.h
refl::GlobalDef().def_packed(
tl::tvm_tensormap_create_tiled, [](PackedArgs args, Any *ret) {
TensorMapArgs T = TensorMapArgs::Extract(args);
CUresult result = cuTensorMapEncodeTiled(
T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim,
T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave,
T.swizzle, T.l2Promotion, T.oobFill);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to initialize the TMA descriptor " << result
<< '\n'
<< T.ToDebugString();
}
*ret = static_cast<int>(result);
});
}

struct TensorMapIm2ColArgs {
@@ -183,7 +191,7 @@
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef().def_packed(
"tvm_tensormap_create_im2col", [](PackedArgs args, Any *ret) {
tl::tvm_tensormap_create_im2col, [](PackedArgs args, Any *ret) {
TensorMapIm2ColArgs T = TensorMapIm2ColArgs::Extract(args);
CUresult result = cuTensorMapEncodeIm2col(
T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim,
@@ -201,5 +209,141 @@ TVM_FFI_STATIC_INIT_BLOCK() {

#endif // (CUDA_MAJOR_VERSION >= 12)

//
// CUDA L2 Persisting Cache Access Policy Window helpers.
// Exposed as TVM FFI packed functions similar to TMA initialization.
//
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
// Set stream access policy window and adjust persisting L2 cache size
// Args:
// [0]: void* base_ptr (required)
// [1]: int64 num_bytes (required)
// [2]: float hit_ratio (optional, default 0.8)
// [3]: void* stream (optional, default 0 => default stream)
// [4]: int64 l2_limit_bytes (optional, default = num_bytes)
refl::GlobalDef().def_packed(
tl::tvm_cuda_stream_set_access_policy_window,
[](PackedArgs args, Any *ret) {
ICHECK(args.size() >= 2) << "Expected at least base_ptr and num_bytes";

void *base_ptr = args[0].cast<void *>();
size_t num_bytes = static_cast<size_t>(args[1].cast<int64_t>());
float hit_ratio = 0.8f;
if (args.size() >= 3) {
// Accept double/float
hit_ratio = static_cast<float>(args[2].cast<double>());
}
CUstream stream = nullptr;
if (args.size() >= 4) {
stream = reinterpret_cast<CUstream>(args[3].cast<void *>());
}
size_t l2_limit_bytes = num_bytes;
if (args.size() >= 5) {
l2_limit_bytes = static_cast<size_t>(args[4].cast<int64_t>());
}

// Clamp requested limit to device capability
CUdevice device;
CUresult result = cuCtxGetDevice(&device);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to get current CUDA device: " << result;
}
int max_persisting = 0;
result = cuDeviceGetAttribute(
&max_persisting, CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE,
device);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to query MAX_PERSISTING_L2_CACHE_SIZE: "
<< result;
}
if (max_persisting > 0 &&
l2_limit_bytes > static_cast<size_t>(max_persisting)) {
l2_limit_bytes = static_cast<size_t>(max_persisting);
}

// Save current limit to restore later
size_t init_persisting_l2_cache_size = 0;
result = cuCtxGetLimit(&init_persisting_l2_cache_size,
CU_LIMIT_PERSISTING_L2_CACHE_SIZE);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to get current persisting L2 cache size limit: "
<< result;
}
__tl_prev_persisting_l2_cache_size = init_persisting_l2_cache_size;
__tl_prev_persisting_l2_cache_saved = true;

// Set new limit
result =
cuCtxSetLimit(CU_LIMIT_PERSISTING_L2_CACHE_SIZE, l2_limit_bytes);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to set persisting L2 cache size limit: "
<< result;
}

// Apply access policy window to stream
CUstreamAttrValue stream_attribute;
memset(&stream_attribute, 0, sizeof(stream_attribute));
stream_attribute.accessPolicyWindow.base_ptr = base_ptr;
stream_attribute.accessPolicyWindow.num_bytes = l2_limit_bytes;
stream_attribute.accessPolicyWindow.hitRatio = hit_ratio;
stream_attribute.accessPolicyWindow.hitProp =
CU_ACCESS_PROPERTY_PERSISTING;
stream_attribute.accessPolicyWindow.missProp =
CU_ACCESS_PROPERTY_STREAMING;

result = cuStreamSetAttribute(stream,
CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW,
&stream_attribute);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to set stream access policy window: " << result;
}

*ret = static_cast<int>(result);
});

// Reset stream access policy window and restore the previous L2 cache size
// Args:
// [0]: void* stream (optional, default 0)
refl::GlobalDef().def_packed(
tl::tvm_cuda_stream_reset_access_policy_window,
[](PackedArgs args, Any *ret) {
CUstream stream = nullptr;
if (args.size() >= 1) {
stream = reinterpret_cast<CUstream>(args[0].cast<void *>());
}

CUstreamAttrValue stream_attribute;
memset(&stream_attribute, 0, sizeof(stream_attribute));
// num_bytes = 0 disables the access policy window on the stream
stream_attribute.accessPolicyWindow.num_bytes = 0;

CUresult result = cuStreamSetAttribute(
stream, CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW,
&stream_attribute);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to reset stream access policy window: "
<< result;
}

result = cuCtxResetPersistingL2Cache();
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to reset persisting L2 cache lines: " << result;
}

if (__tl_prev_persisting_l2_cache_saved) {
result = cuCtxSetLimit(CU_LIMIT_PERSISTING_L2_CACHE_SIZE,
__tl_prev_persisting_l2_cache_size);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to restore persisting L2 cache size limit: "
<< result;
}
__tl_prev_persisting_l2_cache_saved = false;
}

*ret = static_cast<int>(result);
});
}

} // namespace tl
} // namespace tvm
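
For context on how the new __tvm_cuda_stream_set_access_policy_window / __tvm_cuda_stream_reset_access_policy_window helpers behave, the sketch below shows the equivalent CUDA runtime-API pattern from NVIDIA's L2 persistence documentation. It is illustrative only and not part of this PR: it uses the runtime API (cudaStreamSetAttribute, cudaDeviceSetLimit) rather than the driver API in runtime.cc, the helper names are invented for the example, and error checking is omitted.

#include <cuda_runtime.h>
#include <cstddef>

// Illustrative sketch (not the PR code): mirrors the set/reset logic above
// using the CUDA runtime API. Error handling omitted for brevity.
static void set_l2_persisting_window(cudaStream_t stream, void *base_ptr,
                                     size_t num_bytes, float hit_ratio = 0.8f) {
  int device = 0, max_persisting = 0;
  cudaGetDevice(&device);
  cudaDeviceGetAttribute(&max_persisting, cudaDevAttrMaxPersistingL2CacheSize,
                         device);
  size_t limit = num_bytes;
  if (max_persisting > 0 && limit > static_cast<size_t>(max_persisting))
    limit = static_cast<size_t>(max_persisting);  // clamp to device capability
  cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, limit);

  // Mark [base_ptr, base_ptr + limit) as persisting for work on this stream.
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.base_ptr = base_ptr;
  attr.accessPolicyWindow.num_bytes = limit;
  attr.accessPolicyWindow.hitRatio = hit_ratio;
  attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
  attr.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
}

static void reset_l2_persisting_window(cudaStream_t stream) {
  // A zero-sized window disables the policy; then evict persisting lines.
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.num_bytes = 0;
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
  cudaCtxResetPersistingL2Cache();
}

Kernels launched on the stream after the attribute is set will preferentially keep accesses inside the window resident in L2, while the reset path restores normal caching behavior.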
8 changes: 7 additions & 1 deletion src/runtime/runtime.h
@@ -16,7 +16,13 @@ constexpr const char *tvm_tensormap_create_tiled =
constexpr const char *tvm_tensormap_create_im2col =
"__tvm_tensormap_create_im2col";
#endif // (CUDA_MAJOR_VERSION >= 12)

// CUDA stream access policy window helpers
constexpr const char *tvm_cuda_stream_set_access_policy_window =
"__tvm_cuda_stream_set_access_policy_window";
constexpr const char *tvm_cuda_stream_reset_access_policy_window =
"__tvm_cuda_stream_reset_access_policy_window";
Comment on lines +20 to +24
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

Verify CUDA version requirements for stream access policy window APIs.

The new constants tvm_cuda_stream_set_access_policy_window and tvm_cuda_stream_reset_access_policy_window are declared outside the #if (CUDA_MAJOR_VERSION >= 12) guard (lines 13-18), unlike the TMA descriptor constants above them. Ensure this is intentional and that the CUDA stream access policy window APIs (cuStreamSetAttribute with CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW, cuCtxSetLimit, cuCtxResetPersistingL2Cache) are available in CUDA versions prior to 12.


🌐 Web query:

CUDA stream access policy window cuStreamSetAttribute minimum CUDA version

💡 Result:

The stream "access policy window" (cudaStreamAttributeAccessPolicyWindow / accessPolicyWindow used with cudaStreamSetAttribute / cuLaunchAttribute) was introduced in CUDA 11.0 (documented in the CUDA 11 features announcement and in the CUDA Programming Guide). [1][2]

Sources:

  • NVIDIA blog "CUDA 11 Features Revealed". [1]
  • NVIDIA CUDA Programming Guide (examples showing cudaStreamSetAttribute accessPolicyWindow; present in CUDA 11.x documentation). [2]

Add CUDA version guard for stream access policy window constants.

The stream access policy window APIs were introduced in CUDA 11.0, but the constants tvm_cuda_stream_set_access_policy_window and tvm_cuda_stream_reset_access_policy_window (lines 20–24) lack any version guard. They should be wrapped with #if (CUDA_MAJOR_VERSION >= 11) to prevent use in earlier CUDA versions where these APIs are unavailable.

🤖 Prompt for AI Agents
In src/runtime/runtime.h around lines 20 to 24, the CUDA stream access policy
window constant definitions are unguarded and may be referenced on CUDA versions
older than 11; wrap the two constants tvm_cuda_stream_set_access_policy_window
and tvm_cuda_stream_reset_access_policy_window inside a compile-time guard such
as #if (CUDA_MAJOR_VERSION >= 11) ... #endif so they are only defined when
building against CUDA 11.0+; ensure the guard exactly encloses the constexpr
declarations and retains existing formatting and indentation.
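
Concretely, the guard the reviewer is asking for would look roughly like the sketch below (an illustration of the proposal, not the committed diff shown above):

#if (CUDA_MAJOR_VERSION >= 11)
// CUDA stream access policy window helpers (cuStreamSetAttribute with
// CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW requires CUDA 11.0+)
constexpr const char *tvm_cuda_stream_set_access_policy_window =
    "__tvm_cuda_stream_set_access_policy_window";
constexpr const char *tvm_cuda_stream_reset_access_policy_window =
    "__tvm_cuda_stream_reset_access_policy_window";
#endif  // (CUDA_MAJOR_VERSION >= 11)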

} // namespace tl
} // namespace tvm

#endif // TVM_TL_RUNTIME_RUNTIME_H_
#endif // TVM_TL_RUNTIME_RUNTIME_H_