[Debug] Introduce T.print for buffer and variables logging on frontend (#45)

LeiWang1999 · web-flow · commit 5b48f8d7a2db · 2025-01-25T03:09:28.000+08:00
* [Doc] Update documentation structure and content: add overview section, revise project name, and change theme to Furo

* [Feature] Add device-side debug printing functions and integrate into kernel interface

* lint fix

* remove debug print

* implement test for debug

* lint fix

* add some comments

* Enhance fragment design and assert fragment print

* enhance debug print

* add test for msg

* lint fix
diff --git a/.gitignore b/.gitignore
@@ -76,3 +76,6 @@ models/frozenmodels/
 
 # build sdist
 build_sdist/
+
+# do not ignore the debug testing folder (the negated pattern re-includes it)
+!testing/python/debug
diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc
@@ -83,6 +83,7 @@ std::string CodeGenTileLangCUDA::Finish() {
   decl_stream << "#include <tl_templates/cuda/reduce.h>\n";
   decl_stream << "#include <tl_templates/cuda/ldsm.h>\n";
   decl_stream << "#include <tl_templates/cuda/threadblock_swizzle.h>\n";
+  decl_stream << "#include <tl_templates/cuda/debug.h>\n";
   decl_stream << "\n";
   return CodeGenC::Finish();
 }
diff --git a/src/tl_templates/cuda/debug.h b/src/tl_templates/cuda/debug.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include "common.h"
+#include <stdio.h>
+
+// Device-side debug printing helpers. Every function issues a single printf
+// from the calling thread, tagged with msg and the thread's blockIdx/threadIdx.
+// msg and buf_name are only read, so they are taken as const char *; callers
+// can then pass string literals without a deprecated char * conversion.
+
+// Template declaration for device-side debug printing (variable only).
+// This header defines only the explicit specializations below.
+template <typename T> __device__ void debug_print_var(const char *msg, T var);
+
+// Specialization for integer type
+template <> __device__ void debug_print_var<int>(const char *msg, int var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=int "
+         "value=%d\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, var);
+}
+
+// Specialization for float type
+template <> __device__ void debug_print_var<float>(const char *msg, float var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=float "
+         "value=%f\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, var);
+}
+
+// Specialization for half type (value is widened to float for printing)
+template <> __device__ void debug_print_var<half>(const char *msg, half var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=half "
+         "value=%f\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, (float)var);
+}
+
+// Specialization for half_t type (value is widened to float for printing)
+template <>
+__device__ void debug_print_var<half_t>(const char *msg, half_t var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=half_t "
+         "value=%f\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, (float)var);
+}
+
+// Specialization for bfloat16_t type (value is widened to float for printing)
+template <>
+__device__ void debug_print_var<bfloat16_t>(const char *msg, bfloat16_t var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): "
+         "dtype=bfloat16_t value=%f\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, (float)var);
+}
+
+// Specialization for double type
+template <>
+__device__ void debug_print_var<double>(const char *msg, double var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=double "
+         "value=%lf\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, var);
+}
+
+// Template declaration for device-side debug printing (buffer only).
+// This header defines only the explicit specializations below.
+template <typename T>
+__device__ void debug_print_buffer_value(const char *msg, const char *buf_name,
+                                         int index, T var);
+
+// Specialization for integer type
+template <>
+__device__ void debug_print_buffer_value<int>(const char *msg,
+                                              const char *buf_name, int index,
+                                              int var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, "
+         "index=%d, dtype=int value=%d\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, buf_name, index, var);
+}
+
+// Specialization for float type
+template <>
+__device__ void debug_print_buffer_value<float>(const char *msg,
+                                                const char *buf_name, int index,
+                                                float var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, "
+         "index=%d, dtype=float value=%f\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, buf_name, index, var);
+}
+
+// Specialization for half type (value is widened to float for printing)
+template <>
+__device__ void debug_print_buffer_value<half>(const char *msg,
+                                               const char *buf_name, int index,
+                                               half var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, "
+         "index=%d, dtype=half value=%f\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, buf_name, index, (float)var);
+}
+
+// Specialization for half_t type (value is widened to float for printing)
+template <>
+__device__ void debug_print_buffer_value<half_t>(const char *msg,
+                                                 const char *buf_name,
+                                                 int index, half_t var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, "
+         "index=%d, dtype=half_t value=%f\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, buf_name, index, (float)var);
+}
+
+// Specialization for bfloat16_t type (value is widened to float for printing)
+template <>
+__device__ void debug_print_buffer_value<bfloat16_t>(const char *msg,
+                                                     const char *buf_name,
+                                                     int index,
+                                                     bfloat16_t var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, "
+         "index=%d, dtype=bfloat16_t value=%f\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, buf_name, index, (float)var);
+}
+
+// Specialization for double type
+template <>
+__device__ void debug_print_buffer_value<double>(const char *msg,
+                                                 const char *buf_name,
+                                                 int index, double var) {
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, "
+         "index=%d, dtype=double value=%lf\n",
+         msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y,
+         threadIdx.z, buf_name, index, var);
+}
diff --git a/testing/python/debug/test_tilelang_debug_print.py b/testing/python/debug/test_tilelang_debug_print.py
@@ -0,0 +1,104 @@
+# type: ignore
+
+import tilelang
+import tilelang.testing
+import tilelang.language as T
+
+
+def debug_print_buffer(M=16, N=16):
+    dtype = "float16"
+
+    @T.prim_func
+    def program(Q: T.Buffer((M, N), dtype)):
+        with T.Kernel(4, 4, 2, threads=128 * 2) as (bx, by, bz):
+            shared_buf = T.alloc_shared([M, N], dtype)
+            T.print(shared_buf)
+
+    jit_kernel = tilelang.JITKernel(program, target="cuda")
+    profiler = jit_kernel.get_profiler()
+    profiler.run_once()
+
+
+def test_debug_print_buffer():
+    debug_print_buffer(16, 16)
+
+
+def debug_print_buffer_conditional(M=16, N=16):
+    dtype = "float16"
+
+    @T.prim_func
+    def program(Q: T.Buffer((M, N), dtype)):
+        with T.Kernel(4, 4, 2, threads=128 * 2) as (bx, by, bz):
+            shared_buf = T.alloc_shared([M, N], dtype)
+
+            if bx == 0 and by == 0 and bz == 0:
+                T.print(shared_buf)
+
+    jit_kernel = tilelang.JITKernel(program, target="cuda")
+    profiler = jit_kernel.get_profiler()
+    profiler.run_once()
+
+
+def test_debug_print_buffer_conditional():
+    debug_print_buffer_conditional(16, 16)
+
+
+def debug_print_value_conditional(M=16, N=16):
+    dtype = "float16"
+
+    @T.prim_func
+    def program(Q: T.Buffer((M, N), dtype)):
+        with T.Kernel(4, 4, 2, threads=128 * 2) as (bx, by, bz):
+            tid = T.get_thread_binding()
+            if tid == 0:
+                T.print(bx + by + bz)
+
+    jit_kernel = tilelang.JITKernel(program, target="cuda")
+    profiler = jit_kernel.get_profiler()
+    profiler.run_once()
+
+
+def test_debug_print_value_conditional():
+    debug_print_value_conditional(16, 16)
+
+
+def debug_print_register_files(M=16, N=16):
+    dtype = "float16"
+
+    @T.prim_func
+    def program(Q: T.Buffer((M, N), dtype)):
+        with T.Kernel(4, 4, 2, threads=128 * 2) as (bx, by, bz):
+            shared_buf = T.alloc_fragment([M, N], dtype)
+            for i, j in T.Parallel(M, N):
+                T.print(shared_buf[i, j])
+
+    jit_kernel = tilelang.JITKernel(program, target="cuda")
+    profiler = jit_kernel.get_profiler()
+    profiler.run_once()
+
+
+def test_debug_print_register_files():
+    debug_print_register_files(16, 16)
+
+
+def debug_print_msg(M=16, N=16):
+    dtype = "float16"
+
+    @T.prim_func
+    def program(Q: T.Buffer((M, N), dtype)):
+        with T.Kernel(4, 4, 2, threads=128 * 2) as (bx, by, bz):
+            tid = T.get_thread_binding()
+            if tid == 0:
+                T.print(bx + by + bz, msg="hello world")
+
+    jit_kernel = tilelang.JITKernel(program, target="cuda")
+    profiler = jit_kernel.get_profiler()
+    profiler.run_once()
+
+
+def test_debug_print_msg():
+    debug_print_msg(16, 16)
+
+
+if __name__ == "__main__":
+    tilelang.testing.main()
diff --git a/tilelang/language/__init__.py b/tilelang/language/__init__.py
@@ -8,7 +8,7 @@
 from tilelang.layout import Layout, Fragment  # noqa: F401
 from .parallel import Parallel  # noqa: F401
 from .pipeline import Pipelined  # noqa: F401
-from .kernel import Kernel, KernelLaunchFrame  # noqa: F401
+from .kernel import Kernel, KernelLaunchFrame, get_thread_binding  # noqa: F401
 from .allocate import (
     alloc_local,  # noqa: F401
     alloc_shared,  # noqa: F401
@@ -24,6 +24,7 @@
     reduce_sum,  # noqa: F401
     reduce_abssum,  # noqa: F401
 )
+from .print import print  # noqa: F401
 from .customize import (
     atomic_add,  # noqa: F401
     atomic_addx2,  # noqa: F401
diff --git a/tilelang/language/kernel.py b/tilelang/language/kernel.py
@@ -132,6 +132,13 @@ def get_thread_binding(self, dim: int = 0) -> Var:
         """
         return self.frames[-4 + dim].iter_var.var
 
+    def get_thread_bindings(self) -> List[Var]:
+        """
+        Returns the thread bindings of all three dimensions as a list,
+        ordered as [threadIdx.x, threadIdx.y, threadIdx.z] (dim 0, 1, 2).
+        """
+        return [frame.iter_var.var for frame in self.frames[-4:-1]]
+
     def get_num_threads(self) -> int:
         """
         Returns the thread indices from the topmost frame.
@@ -213,3 +220,15 @@ def Kernel(
         attrs["pragma_import_c"] = prelude
 
     return _ffi_api.KernelLaunch(blocks, threads, attrs)
+
+
+def get_thread_binding(dim: int = 0) -> Var:
+    """Returns the thread binding for dimension `dim` (default 0) by
+    delegating to KernelLaunchFrame.Current().get_thread_binding(dim)."""
+    return KernelLaunchFrame.Current().get_thread_binding(dim)
+
+
+def get_thread_bindings() -> List[Var]:
+    """Returns all three thread bindings by delegating to
+    KernelLaunchFrame.Current().get_thread_bindings()."""
+    return KernelLaunchFrame.Current().get_thread_bindings()
diff --git a/tilelang/language/print.py b/tilelang/language/print.py
diff --git a/tilelang/layout/fragment.py b/tilelang/layout/fragment.py

Original file line number	Diff line number	Diff line change
`@@ -83,6 +83,7 @@ std::string CodeGenTileLangCUDA::Finish() {`
`83`	`83`	`decl_stream << "#include <tl_templates/cuda/reduce.h>\n";`
`84`	`84`	`decl_stream << "#include <tl_templates/cuda/ldsm.h>\n";`
`85`	`85`	`decl_stream << "#include <tl_templates/cuda/threadblock_swizzle.h>\n";`
	`86`	`+ decl_stream << "#include <tl_templates/cuda/debug.h>\n";`
`86`	`87`	`decl_stream << "\n";`
`87`	`88`	`return CodeGenC::Finish();`
`88`	`89`	`}`