Merged
29 commits
6745eac
[Refactor] Update FFI type handling and simplify argument management
LeiWang1999 Nov 14, 2025
b09d6d7
[Update] Sync TVM submodule and enhance kernel source handling
LeiWang1999 Nov 14, 2025
dacab10
[Refactor] Clean up imports and improve code formatting
LeiWang1999 Nov 14, 2025
ca91aa8
Merge branch 'main' of https://github.com/tile-ai/tilelang into ffi_1114
LeiWang1999 Nov 15, 2025
c6812f1
Update execution backend options and improve resolution logic
LeiWang1999 Nov 15, 2025
d09b267
lint fix
LeiWang1999 Nov 15, 2025
4a6e99b
fix
LeiWang1999 Nov 16, 2025
1ee9132
Enhance argument handling in CUDA and HIP runtime modules
LeiWang1999 Nov 16, 2025
4364065
lint fix
LeiWang1999 Nov 16, 2025
a555165
lint fix
LeiWang1999 Nov 16, 2025
2f0bf8d
lint fix
LeiWang1999 Nov 16, 2025
8f59fa1
lint fix
LeiWang1999 Nov 16, 2025
0df9ba4
minor fix
LeiWang1999 Nov 17, 2025
e202d38
fix
LeiWang1999 Nov 17, 2025
72fff54
recover check
LeiWang1999 Nov 17, 2025
3b30b19
Refactor argument binding and validation in `arg_binder.cc`
LeiWang1999 Nov 17, 2025
f85bb8f
lint fix
LeiWang1999 Nov 17, 2025
65448b5
stride fix
LeiWang1999 Nov 17, 2025
f2f8a28
minor fix
LeiWang1999 Nov 18, 2025
0794c29
fix
LeiWang1999 Nov 18, 2025
2b3bd54
lint fix
LeiWang1999 Nov 18, 2025
7fdfa12
lint fix
LeiWang1999 Nov 18, 2025
f177025
Add CUDA stream access policy window helpers and integrate with L2 pe…
LeiWang1999 Nov 18, 2025
0694198
check with symbolic
LeiWang1999 Nov 18, 2025
f13aecd
support null ptr
LeiWang1999 Nov 18, 2025
7165251
Update CMakeLists and lower.py for code generation and subproject status
LeiWang1999 Nov 18, 2025
2e6f110
lint fix
LeiWang1999 Nov 18, 2025
8d748ba
Merge branch 'main' of https://github.com/tile-ai/tilelang into ffi_1114
LeiWang1999 Nov 18, 2025
7c0b8cd
Update comments for clarity in quickstart.py
LeiWang1999 Nov 18, 2025
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 093b2c to f4105f
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -138,6 +138,7 @@ file(GLOB TILE_LANG_SRCS
src/transform/*.cc
src/op/*.cc
src/target/utils.cc
src/target/codegen_c_host.cc
src/target/codegen_cpp.cc
src/target/rt_mod_cpp.cc
# intrin_rule doesn't have system dependency
1 change: 0 additions & 1 deletion examples/blocksparse_gemm/example_blocksparse_gemm.py
@@ -166,7 +166,6 @@ def main():
enable_rasteration=DEFAULT_ENABLE_RASTERIZATION)
block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K
print(f"Using default kernel with block size ({block_M}, {block_N}, {block_K})")

# Create block mask with desired sparsity
mask_shape = (M // block_M, N // block_N, K // block_K)
block_mask = torch.rand(mask_shape).cuda() > sparsity
1 change: 0 additions & 1 deletion examples/gdn/example_chunk_o_bwd.py
@@ -468,7 +468,6 @@ def run_test(
kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype,
gate_dtype, state_dtype, chunk_size, scale, use_g, use_dw,
block_DK, block_DV, threads, num_stages)
print(kernel.get_kernel_source())
dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W)

if use_g:
1 change: 1 addition & 0 deletions examples/gdn/test_example_gdn_compilation.py
@@ -117,6 +117,7 @@ def test_example_chunk_o_bwd_compilation():
kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype,
gate_dtype, state_dtype, chunk_size, 1.0, use_g, True,
block_DK, block_DV, threads, num_stages)

dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv,
W) # noqa: F841
if use_g:
5 changes: 2 additions & 3 deletions examples/quickstart.py
@@ -55,10 +55,9 @@ def matmul_relu_kernel(
block_N = 128
block_K = 32

# 1. Define the kernel (matmul) and compile/lower it into an executable module
# Define the kernel (matmul) and compile/lower it into an executable module
matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K)

# 3. Test the kernel in Python with PyTorch data
# Test the kernel in Python with PyTorch data
import torch

# Create random input tensors on the GPU
1 change: 1 addition & 0 deletions pyproject.toml
@@ -104,6 +104,7 @@ tilelang = "tilelang"
# TVM
"tilelang/3rdparty/tvm/src" = "3rdparty/tvm/src"
"tilelang/3rdparty/tvm/python" = "3rdparty/tvm/python"
"tilelang/3rdparty/tvm/include" = "3rdparty/tvm/include"
"tilelang/3rdparty/tvm/version.py" = "3rdparty/tvm/version.py"
# CUTLASS
"tilelang/3rdparty/cutlass/include" = "3rdparty/cutlass/include"
172 changes: 158 additions & 14 deletions src/runtime/runtime.cc
@@ -13,6 +13,12 @@
namespace tvm {
namespace tl {

#if 1
// Thread-local storage for restoring the L2 persisting cache limit
static thread_local size_t __tl_prev_persisting_l2_cache_size = 0;
static thread_local bool __tl_prev_persisting_l2_cache_saved = false;
#endif

#if (CUDA_MAJOR_VERSION >= 12)
template <typename T> static std::string ArrayToStr(const T *ptr, size_t n) {
std::stringstream ss;
@@ -91,19 +97,21 @@ struct TensorMapArgs {
// set device api
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef().def_packed("tvm_tensormap_create_tiled", [](PackedArgs args,
Any *ret) {
TensorMapArgs T = TensorMapArgs::Extract(args);
CUresult result = cuTensorMapEncodeTiled(
T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim,
T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave, T.swizzle,
T.l2Promotion, T.oobFill);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to initialize the TMA descriptor " << result << '\n'
<< T.ToDebugString();
}
*ret = static_cast<int>(result);
});
// Register using the canonical names defined in runtime.h
refl::GlobalDef().def_packed(
tl::tvm_tensormap_create_tiled, [](PackedArgs args, Any *ret) {
TensorMapArgs T = TensorMapArgs::Extract(args);
CUresult result = cuTensorMapEncodeTiled(
T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim,
T.globalStride + 1, T.boxDim, T.elementStrides, T.interleave,
T.swizzle, T.l2Promotion, T.oobFill);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to initialize the TMA descriptor " << result
<< '\n'
<< T.ToDebugString();
}
*ret = static_cast<int>(result);
});
}

struct TensorMapIm2ColArgs {
@@ -183,7 +191,7 @@
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef().def_packed(
"tvm_tensormap_create_im2col", [](PackedArgs args, Any *ret) {
tl::tvm_tensormap_create_im2col, [](PackedArgs args, Any *ret) {
TensorMapIm2ColArgs T = TensorMapIm2ColArgs::Extract(args);
CUresult result = cuTensorMapEncodeIm2col(
T.map, T.type, T.tensorRank, T.globalAddress, T.globalDim,
@@ -201,5 +209,141 @@ TVM_FFI_STATIC_INIT_BLOCK() {

#endif // (CUDA_MAJOR_VERSION >= 12)

//
// CUDA L2 Persisting Cache Access Policy Window helpers.
// Exposed as TVM FFI packed functions similar to TMA initialization.
//
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
// Set stream access policy window and adjust persisting L2 cache size
// Args:
// [0]: void* base_ptr (required)
// [1]: int64 num_bytes (required)
// [2]: float hit_ratio (optional, default 0.8)
// [3]: void* stream (optional, default 0 => default stream)
// [4]: int64 l2_limit_bytes (optional, default = num_bytes)
refl::GlobalDef().def_packed(
tl::tvm_cuda_stream_set_access_policy_window,
[](PackedArgs args, Any *ret) {
ICHECK(args.size() >= 2) << "Expected at least base_ptr and num_bytes";

void *base_ptr = args[0].cast<void *>();
size_t num_bytes = static_cast<size_t>(args[1].cast<int64_t>());
float hit_ratio = 0.8f;
if (args.size() >= 3) {
// Accept double/float
hit_ratio = static_cast<float>(args[2].cast<double>());
}
CUstream stream = nullptr;
if (args.size() >= 4) {
stream = reinterpret_cast<CUstream>(args[3].cast<void *>());
}
size_t l2_limit_bytes = num_bytes;
if (args.size() >= 5) {
l2_limit_bytes = static_cast<size_t>(args[4].cast<int64_t>());
}

// Clamp requested limit to device capability
CUdevice device;
CUresult result = cuCtxGetDevice(&device);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to get current CUDA device: " << result;
}
int max_persisting = 0;
result = cuDeviceGetAttribute(
&max_persisting, CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE,
device);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to query MAX_PERSISTING_L2_CACHE_SIZE: "
<< result;
}
if (max_persisting > 0 &&
l2_limit_bytes > static_cast<size_t>(max_persisting)) {
l2_limit_bytes = static_cast<size_t>(max_persisting);
}

// Save current limit to restore later
size_t init_persisting_l2_cache_size = 0;
result = cuCtxGetLimit(&init_persisting_l2_cache_size,
CU_LIMIT_PERSISTING_L2_CACHE_SIZE);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to get current persisting L2 cache size limit: "
<< result;
}
__tl_prev_persisting_l2_cache_size = init_persisting_l2_cache_size;
__tl_prev_persisting_l2_cache_saved = true;

// Set new limit
result =
cuCtxSetLimit(CU_LIMIT_PERSISTING_L2_CACHE_SIZE, l2_limit_bytes);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to set persisting L2 cache size limit: "
<< result;
}

// Apply access policy window to stream
CUstreamAttrValue stream_attribute;
memset(&stream_attribute, 0, sizeof(stream_attribute));
stream_attribute.accessPolicyWindow.base_ptr = base_ptr;
stream_attribute.accessPolicyWindow.num_bytes = l2_limit_bytes;
stream_attribute.accessPolicyWindow.hitRatio = hit_ratio;
stream_attribute.accessPolicyWindow.hitProp =
CU_ACCESS_PROPERTY_PERSISTING;
stream_attribute.accessPolicyWindow.missProp =
CU_ACCESS_PROPERTY_STREAMING;

result = cuStreamSetAttribute(stream,
CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW,
&stream_attribute);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to set stream access policy window: " << result;
}

*ret = static_cast<int>(result);
});

// Reset stream access policy window and restore the previous L2 cache size
// Args:
// [0]: void* stream (optional, default 0)
refl::GlobalDef().def_packed(
tl::tvm_cuda_stream_reset_access_policy_window,
[](PackedArgs args, Any *ret) {
CUstream stream = nullptr;
if (args.size() >= 1) {
stream = reinterpret_cast<CUstream>(args[0].cast<void *>());
}

CUstreamAttrValue stream_attribute;
memset(&stream_attribute, 0, sizeof(stream_attribute));
// num_bytes = 0 disables the access policy window on the stream
stream_attribute.accessPolicyWindow.num_bytes = 0;

CUresult result = cuStreamSetAttribute(
stream, CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW,
&stream_attribute);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to reset stream access policy window: "
<< result;
}

result = cuCtxResetPersistingL2Cache();
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to reset persisting L2 cache lines: " << result;
}

if (__tl_prev_persisting_l2_cache_saved) {
result = cuCtxSetLimit(CU_LIMIT_PERSISTING_L2_CACHE_SIZE,
__tl_prev_persisting_l2_cache_size);
if (result != CUDA_SUCCESS) {
LOG_FATAL << "Failed to restore persisting L2 cache size limit: "
<< result;
}
__tl_prev_persisting_l2_cache_saved = false;
}

*ret = static_cast<int>(result);
});
}

} // namespace tl
} // namespace tvm
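
For context on how the new __tvm_cuda_stream_set_access_policy_window / __tvm_cuda_stream_reset_access_policy_window helpers behave, the sketch below shows the equivalent CUDA runtime-API pattern from NVIDIA's L2 persistence documentation. It is illustrative only and not part of this PR: it uses the runtime API (cudaStreamSetAttribute, cudaDeviceSetLimit) rather than the driver API in runtime.cc, the helper names are invented for the example, and error checking is omitted.

#include <cuda_runtime.h>
#include <cstddef>

// Illustrative sketch (not the PR code): mirrors the set/reset logic above
// using the CUDA runtime API. Error handling omitted for brevity.
static void set_l2_persisting_window(cudaStream_t stream, void *base_ptr,
                                     size_t num_bytes, float hit_ratio = 0.8f) {
  int device = 0, max_persisting = 0;
  cudaGetDevice(&device);
  cudaDeviceGetAttribute(&max_persisting, cudaDevAttrMaxPersistingL2CacheSize,
                         device);
  size_t limit = num_bytes;
  if (max_persisting > 0 && limit > static_cast<size_t>(max_persisting))
    limit = static_cast<size_t>(max_persisting);  // clamp to device capability
  cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, limit);

  // Mark [base_ptr, base_ptr + limit) as persisting for work on this stream.
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.base_ptr = base_ptr;
  attr.accessPolicyWindow.num_bytes = limit;
  attr.accessPolicyWindow.hitRatio = hit_ratio;
  attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
  attr.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
}

static void reset_l2_persisting_window(cudaStream_t stream) {
  // A zero-sized window disables the policy; then evict persisting lines.
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.num_bytes = 0;
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
  cudaCtxResetPersistingL2Cache();
}

Kernels launched on the stream after the attribute is set will preferentially keep accesses inside the window resident in L2, while the reset path restores normal caching behavior.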
8 changes: 7 additions & 1 deletion src/runtime/runtime.h
@@ -16,7 +16,13 @@ constexpr const char *tvm_tensormap_create_tiled =
constexpr const char *tvm_tensormap_create_im2col =
"__tvm_tensormap_create_im2col";
#endif // (CUDA_MAJOR_VERSION >= 12)

// CUDA stream access policy window helpers
constexpr const char *tvm_cuda_stream_set_access_policy_window =
"__tvm_cuda_stream_set_access_policy_window";
constexpr const char *tvm_cuda_stream_reset_access_policy_window =
"__tvm_cuda_stream_reset_access_policy_window";
Comment on lines +20 to +24
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

Verify CUDA version requirements for stream access policy window APIs.

The new constants tvm_cuda_stream_set_access_policy_window and tvm_cuda_stream_reset_access_policy_window are declared outside the #if (CUDA_MAJOR_VERSION >= 12) guard (lines 13-18), unlike the TMA descriptor constants above them. Ensure this is intentional and that the CUDA stream access policy window APIs (cuStreamSetAttribute with CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW, cuCtxSetLimit, cuCtxResetPersistingL2Cache) are available in CUDA versions prior to 12.


🌐 Web query:

CUDA stream access policy window cuStreamSetAttribute minimum CUDA version

💡 Result:

The stream "access policy window" (cudaStreamAttributeAccessPolicyWindow / accessPolicyWindow used with cudaStreamSetAttribute / cuLaunchAttribute) was introduced in CUDA 11.0 (documented in the CUDA 11 features announcement and in the CUDA Programming Guide). [1][2]

Sources:

  • NVIDIA blog "CUDA 11 Features Revealed". [1]
  • NVIDIA CUDA Programming Guide (examples showing cudaStreamSetAttribute accessPolicyWindow; present in CUDA 11.x documentation). [2]

Add CUDA version guard for stream access policy window constants.

The stream access policy window APIs were introduced in CUDA 11.0, but the constants tvm_cuda_stream_set_access_policy_window and tvm_cuda_stream_reset_access_policy_window (lines 20–24) lack any version guard. They should be wrapped with #if (CUDA_MAJOR_VERSION >= 11) to prevent use in earlier CUDA versions where these APIs are unavailable.

🤖 Prompt for AI Agents
In src/runtime/runtime.h around lines 20 to 24, the CUDA stream access policy
window constant definitions are unguarded and may be referenced on CUDA versions
older than 11; wrap the two constants tvm_cuda_stream_set_access_policy_window
and tvm_cuda_stream_reset_access_policy_window inside a compile-time guard such
as #if (CUDA_MAJOR_VERSION >= 11) ... #endif so they are only defined when
building against CUDA 11.0+; ensure the guard exactly encloses the constexpr
declarations and retains existing formatting and indentation.
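
Concretely, the guard the reviewer is asking for would look roughly like the sketch below (an illustration of the proposal, not the committed diff shown above):

#if (CUDA_MAJOR_VERSION >= 11)
// CUDA stream access policy window helpers (cuStreamSetAttribute with
// CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW requires CUDA 11.0+)
constexpr const char *tvm_cuda_stream_set_access_policy_window =
    "__tvm_cuda_stream_set_access_policy_window";
constexpr const char *tvm_cuda_stream_reset_access_policy_window =
    "__tvm_cuda_stream_reset_access_policy_window";
#endif  // (CUDA_MAJOR_VERSION >= 11)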

} // namespace tl
} // namespace tvm

#endif // TVM_TL_RUNTIME_RUNTIME_H_
#endif // TVM_TL_RUNTIME_RUNTIME_H_