
Commit 9962ec5

[misc] polish && add reference && apply review suggestions && format
1 parent de09434 commit 9962ec5

File tree: 8 files changed (+26, -40 lines)

benchmark/matmul/benchmark_matmul_sp.py

Lines changed: 3 additions & 1 deletion
@@ -288,6 +288,8 @@ def main(
     print(f"Best config: {best_config}")

     if args.bench_torch_sparse is not None:
-        print(f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}")
+        print(
+            f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}"
+        )

     print(f"Reference Dense TFlops: {total_flops / ref_latency * 1e-9:.3f}")

src/tl_templates/cpp/half.hpp

Lines changed: 5 additions & 15 deletions
@@ -513,9 +513,7 @@ using std::true_type;
 template <typename T> struct is_float : std::is_floating_point<T> {};
 #else
 /// Conditional type.
-template <bool, typename T, typename> struct conditional {
-  typedef T type;
-};
+template <bool, typename T, typename> struct conditional { typedef T type; };
 template <typename T, typename F> struct conditional<false, T, F> {
   typedef F type;
 };
@@ -536,9 +534,7 @@ template <> struct is_float<long double> : true_type {};
 #endif

 /// Type traits for floating-point bits.
-template <typename T> struct bits {
-  typedef unsigned char type;
-};
+template <typename T> struct bits { typedef unsigned char type; };
 template <typename T> struct bits<const T> : bits<T> {};
 template <typename T> struct bits<volatile T> : bits<T> {};
 template <typename T> struct bits<const volatile T> : bits<T> {};
@@ -554,14 +550,10 @@ typedef std::uint_fast32_t uint32;
 typedef std::int_fast32_t int32;

 /// Unsigned integer of (at least) 32 bits width.
-template <> struct bits<float> {
-  typedef std::uint_least32_t type;
-};
+template <> struct bits<float> { typedef std::uint_least32_t type; };

 /// Unsigned integer of (at least) 64 bits width.
-template <> struct bits<double> {
-  typedef std::uint_least64_t type;
-};
+template <> struct bits<double> { typedef std::uint_least64_t type; };
 #else
 /// Unsigned integer of (at least) 16 bits width.
 typedef unsigned short uint16;
@@ -586,9 +578,7 @@ struct bits<double>
                   unsigned long, unsigned long long> {};
 #else
 /// Unsigned integer of (at least) 64 bits width.
-template <> struct bits<double> {
-  typedef unsigned long type;
-};
+template <> struct bits<double> { typedef unsigned long type; };
 #endif
 #endif

src/tl_templates/cuda/common.h

Lines changed: 1 addition & 3 deletions
@@ -136,9 +136,7 @@ TL_DEVICE unsigned int cast_smem_ptr_to_int(const void *const smem_ptr) {
   return smem_int;
 }

-template <typename T> struct normalize_atomic_type {
-  using type = T;
-};
+template <typename T> struct normalize_atomic_type { using type = T; };

 template <> /**
              * Map the public half_t alias to the native `half` type for atomic

src/tl_templates/cuda/gemm_sp_sm80.h

Lines changed: 4 additions & 6 deletions
@@ -28,6 +28,8 @@ template <typename Shape> struct ShapeCheck<uint8_t, Shape> {
       (Shape::kM % 16 == 0) && (Shape::kN % 16 == 0) && (Shape::kK % 64 == 0);
 };

+// ref:
+// https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
 template <typename T> struct DispatchInstructionShape {
   static_assert(!std::is_same_v<T, T>,
                 "Unsupported type for DispatchInstructionShape");
@@ -119,13 +121,9 @@ template <> struct DispatchType<cutlass::bfloat16_t> {
   using Type = cutlass::bfloat16_t;
 };

-template <> struct DispatchType<unsigned char> {
-  using Type = uint8_t;
-};
+template <> struct DispatchType<unsigned char> { using Type = uint8_t; };

-template <> struct DispatchType<signed char> {
-  using Type = int8_t;
-};
+template <> struct DispatchType<signed char> { using Type = int8_t; };

 template <typename Shape, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum, typename A_type_raw,

testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp.py

Lines changed: 2 additions & 2 deletions
@@ -364,8 +364,8 @@ def test_gemm_sp_sm90():
     run_gemm_sp_sm90(512, 1024, 768, "int8", "int32", "int32", 64, 64, 64, 2, 128, False, True)


-# @tilelang.testing.requires_cuda
-# @tilelang.testing.requires_cuda_compute_version(8, 0)
+@tilelang.testing.requires_cuda
+@tilelang.testing.requires_cuda_compute_version(8, 0)
 def test_gemm_sp_sm80():
     run_gemm_sp_sm80(512, 1024, 768, "float16", "float32", "float32", 32, 32, 32, 0, 32)
     run_gemm_sp_sm80(512, 1024, 768, "float16", "float32", "float32", 64, 64, 64, 0, 32)
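Re-enabling these decorators means test_gemm_sp_sm80 is skipped unless a CUDA device with compute capability 8.0 or newer is present, rather than being commented out entirely. A rough, illustrative equivalent of that gate written directly against pytest and torch (the real tilelang.testing helpers may be implemented differently):

import pytest
import torch


def requires_cuda_compute_version(major: int, minor: int = 0):
    """Skip the decorated test unless a CUDA device with at least the
    given compute capability is available (illustrative sketch only)."""
    ok = torch.cuda.is_available()
    if ok:
        ok = torch.cuda.get_device_capability(0) >= (major, minor)
    return pytest.mark.skipif(
        not ok, reason=f"requires CUDA compute capability >= {major}.{minor}")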

tilelang/language/builtin.py

Lines changed: 1 addition & 1 deletion
@@ -355,4 +355,4 @@ def sync_grid():
 def cp_async_barrier_noinc(barrier_id: Union[int, PrimExpr, tir.Call]):
     """Perform a ptx async copy barrier using cp.async.mbarrier.arrive.noinc.
     """
-    return tir.call_intrin("handle", tir.op.Op.get("tl.ptx_cp_async_barrier_noinc"), barrier_id)
+    return tir.call_intrin("handle", tir.op.Op.get("tl.ptx_cp_async_barrier_noinc"), barrier_id)

tilelang/layout/gemm_sp.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 """Wrapping Layouts."""
 # pylint: disable=invalid-name, unsupported-binary-operation

-from tilelang.autotuner.capture import Optional
+from typing import Optional
 import tvm
 import tilelang.language as T
 import warnings
@@ -116,7 +116,7 @@ def _make_metadata_layout_sm8x_cutlass(buffer: tvm.tir.Buffer, mma_dtype: str):
     # https://github.com/nvidia/cutlass/blob/ad7b2f5e84fcfa124cb02b91d5bd26d238c0459e/include/cutlass/layout/matrix.h#L405
     # https://github.com/nvidia/cutlass/blob/ad7b2f5e84fcfa124cb02b91d5bd26d238c0459e/include/cutlass/gemm/warp/mma_sparse_tensor_op.h#L172

-    if mma_dtype in ["float16, bfloat16"] and buffer.dtype not in ["uint16", "int16"]:
+    if mma_dtype in ["float16", "bfloat16"] and buffer.dtype not in ["uint16", "int16"]:
         raise ValueError(f"metadata should be 16 bit, got {buffer.dtype}")

     if mma_dtype in ["float8", "int8", "uint8"] and buffer.dtype not in ["uint32", "int32"]:
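The quoting fix above is more than formatting: the old check tested membership in a one-element list holding the single string "float16, bfloat16", so neither dtype ever matched and the 16-bit metadata check never fired. A quick illustration:

# Old, buggy form: the list holds ONE string, so real dtypes never match.
"float16" in ["float16, bfloat16"]     # False
"bfloat16" in ["float16, bfloat16"]    # False

# Fixed form: two separate strings.
"float16" in ["float16", "bfloat16"]   # True
"bfloat16" in ["float16", "bfloat16"]  # True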

tilelang/utils/sparse.py

Lines changed: 8 additions & 10 deletions
@@ -62,17 +62,15 @@ def compress_sm80(A: torch.Tensor, transposed: bool) -> tuple[torch.Tensor, torc
     except ImportError as err:
         raise ImportError("SparseSemiStructuredTensor is not available in this version of PyTorch. "
                           "Please install a compatible version.") from err
-
     orig_val = SparseSemiStructuredTensor._FORCE_CUTLASS
-    SparseSemiStructuredTensor._FORCE_CUTLASS = True
-
-    if transposed is not False:
-        raise NotImplementedError("transposed flag is deprecated by pytorch")
-
-    compressed = to_sparse_semi_structured(A)
-    SparseSemiStructuredTensor._FORCE_CUTLASS = orig_val
-
-    return compressed.packed, compressed.meta
+    try:
+        SparseSemiStructuredTensor._FORCE_CUTLASS = True
+        if transposed is not False:
+            raise NotImplementedError("transposed flag is deprecated by pytorch")
+        compressed = to_sparse_semi_structured(A)
+        return compressed.packed, compressed.meta
+    finally:
+        SparseSemiStructuredTensor._FORCE_CUTLASS = orig_val


 def compress(A: torch.Tensor,
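The restructuring in compress_sm80 guarantees that _FORCE_CUTLASS is restored even when to_sparse_semi_structured raises or the transposed check trips the NotImplementedError. The same save/override/restore shape in isolation, using a hypothetical _Flags holder in place of PyTorch's class attribute:

class _Flags:
    # Stand-in for SparseSemiStructuredTensor._FORCE_CUTLASS.
    FORCE_CUTLASS = False


def with_forced_cutlass(fn):
    """Run fn with FORCE_CUTLASS temporarily set, restoring the previous
    value even if fn raises (mirrors the compress_sm80 change)."""
    orig_val = _Flags.FORCE_CUTLASS
    try:
        _Flags.FORCE_CUTLASS = True
        return fn()
    finally:
        _Flags.FORCE_CUTLASS = orig_val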
