tile-ai · LeiWang1999 · May 17, 2025 · May 14, 2025 · May 14, 2025 · May 15, 2025
diff --git a/examples/deepseek_mla/test_example_mla_decode.py b/examples/deepseek_mla/test_example_mla_decode.py
@@ -8,6 +8,7 @@
 
 
 @tilelang.testing.requires_cuda
+@tilelang.testing.requires_cuda_compute_version_ge(9, 0)  # presumably an sm_90+ gate; confirm
 def test_example_mla_decode():
     with mock.patch.object(sys, 'argv', ["example_mla_decode.py"]):
         example_mla_decode.main()

diff --git a/src/tl_templates/cuda/gemm_sm80.h b/src/tl_templates/cuda/gemm_sm80.h
@@ -98,7 +98,8 @@ struct OperandTraits<16, N, K, false, num_warp_n,
       Swizzle<2, 3, 3>{}, Layout<Shape<_32, _8>, Stride<_1, _32>>{}));
   using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape<Int<N>, Int<K>>{},
                                         Step<_2, _1>{}));
-  using Copy = SM75_U16x8_LDSM_T;
+  using Copy = typename std::conditional<N == 8 * num_warp_n, SM75_U16x4_LDSM_T,
+                                         SM75_U16x8_LDSM_T>::type; // x4 atom when N == 8 * num_warp_n, else x8
 };
 
 template <int N, int K, int num_warp_n>
@@ -108,7 +109,8 @@ struct OperandTraits<16, N, K, false, num_warp_n,
       Swizzle<3, 3, 3>{}, Layout<Shape<_64, _8>, Stride<_1, _64>>{}));
   using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape<Int<N>, Int<K>>{},
                                         Step<_2, _1>{}));
-  using Copy = SM75_U16x8_LDSM_T;
+  using Copy = typename std::conditional<N == 8 * num_warp_n, SM75_U16x4_LDSM_T,
+                                         SM75_U16x8_LDSM_T>::type; // x4 atom when N == 8 * num_warp_n, else x8
 };
 
 template <int N, int K, int num_warp_n>

diff --git a/src/tl_templates/cuda/gemm_sm89.h b/src/tl_templates/cuda/gemm_sm89.h
@@ -201,7 +201,8 @@ struct OperandTraits<16, N, K, false, num_warp_n,
       Swizzle<2, 3, 3>{}, Layout<Shape<_32, _8>, Stride<_1, _32>>{}));
   using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape<Int<N>, Int<K>>{},
                                         Step<_2, _1>{}));
-  using Copy = SM75_U16x8_LDSM_T;
+  using Copy = typename std::conditional<N == 8 * num_warp_n, SM75_U16x4_LDSM_T,
+                                         SM75_U16x8_LDSM_T>::type; // x4 atom when N == 8 * num_warp_n, else x8
 };
 
 template <int N, int K, int num_warp_n>
@@ -211,7 +212,8 @@ struct OperandTraits<16, N, K, false, num_warp_n,
       Swizzle<3, 3, 3>{}, Layout<Shape<_64, _8>, Stride<_1, _64>>{}));
   using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape<Int<N>, Int<K>>{},
                                         Step<_2, _1>{}));
-  using Copy = SM75_U16x8_LDSM_T;
+  using Copy = typename std::conditional<N == 8 * num_warp_n, SM75_U16x4_LDSM_T,
+                                         SM75_U16x8_LDSM_T>::type; // x4 atom when N == 8 * num_warp_n, else x8
 };
 
 template <int N, int K, int num_warp_n>

diff --git a/src/tl_templates/cuda/gemm_sm90.h b/src/tl_templates/cuda/gemm_sm90.h
@@ -255,7 +255,8 @@ struct OperandTraits<16, N, K, false, num_warp_n,
       Swizzle<2, 3, 3>{}, Layout<Shape<_32, _8>, Stride<_1, _32>>{}));
   using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape<Int<N>, Int<K>>{},
                                         Step<_2, _1>{}));
-  using Copy = SM75_U16x8_LDSM_T;
+  using Copy = typename std::conditional<N == 8 * num_warp_n, SM75_U16x4_LDSM_T,
+                                         SM75_U16x8_LDSM_T>::type; // x4 atom when N == 8 * num_warp_n, else x8
 };
 
 template <int N, int K, int num_warp_n>
@@ -265,7 +266,8 @@ struct OperandTraits<16, N, K, false, num_warp_n,
       Swizzle<3, 3, 3>{}, Layout<Shape<_64, _8>, Stride<_1, _64>>{}));
   using Layout = decltype(tile_to_shape(LayoutAtom{}, Shape<Int<N>, Int<K>>{},
                                         Step<_2, _1>{}));
-  using Copy = SM75_U16x8_LDSM_T;
+  using Copy = typename std::conditional<N == 8 * num_warp_n, SM75_U16x4_LDSM_T,
+                                         SM75_U16x8_LDSM_T>::type; // x4 atom when N == 8 * num_warp_n, else x8
 };
 
 template <int N, int K, int num_warp_n>