4 changes: 2 additions & 2 deletions flashinfer/artifacts.py
@@ -87,7 +87,7 @@ class ArtifactPath:
     When compiling new cubins for backend directories, update the corresponding path.
     """
 
-    TRTLLM_GEN_FMHA: str = "1e49deb33ec20018ae0acf1d956a579578069da1/fmha/trtllm-gen/"
+    TRTLLM_GEN_FMHA: str = "9f1b6ddaa1592a8339a82fcab7d27a57eff445fd/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
         "c108f5cc46420e11805467898186533fb48d6a6f/batched_gemm-0d28130-7b26988"
     )
@@ -107,7 +107,7 @@ class CheckSumHash:
     """
 
     TRTLLM_GEN_FMHA: str = (
-        "66757498f573430583d63b04c02bf9e38306eefe2ce31df9b5d923d99bd15d84"
+        "a5a60600a80076317703695f56bbef2f0a44075ef4e24d7b06ba67ff68bc9da2"
     )
     TRTLLM_GEN_BMM: str = (
         "85a4516b7ab25b1a6495398ae934a00e30ccd6662b9ec27be1330d7bba5e1ddf"
     )
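
The artifacts.py change points the TRTLLM_GEN_FMHA cubin path at a new commit hash and updates its SHA-256 checksum to match. As a minimal sketch of how such a checksum can be checked against a downloaded archive (the download location and helper names here are hypothetical, not FlashInfer's actual artifact loader; the constant is copied from the CheckSumHash entry above):

import hashlib
from pathlib import Path

# Copied from CheckSumHash.TRTLLM_GEN_FMHA in the diff above.
TRTLLM_GEN_FMHA_SHA256 = (
    "a5a60600a80076317703695f56bbef2f0a44075ef4e24d7b06ba67ff68bc9da2"
)

def sha256_of(path: Path) -> str:
    # Stream the file in 1 MiB chunks so large cubin archives need not fit in memory.
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

def verify_fmha_artifact(local_path: Path) -> None:
    # Hypothetical helper: raise if the downloaded archive does not match the pinned checksum.
    actual = sha256_of(local_path)
    if actual != TRTLLM_GEN_FMHA_SHA256:
        raise ValueError(
            f"Checksum mismatch: expected {TRTLLM_GEN_FMHA_SHA256}, got {actual}"
        )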
9 changes: 7 additions & 2 deletions tests/attention/test_trtllm_gen_attention.py
@@ -414,13 +414,13 @@ def _test_trtllm_batch_prefill(
     max_q_len,
     max_kv_len,
     device_scale,
+    head_dim,
 ):
     compute_capability = get_compute_capability(torch.device(device="cuda"))
     if compute_capability[0] != 10:
         pytest.skip("These tests are only guaranteed to work on SM100 and SM103 GPUs.")
     # Set up test parameters
     torch.manual_seed(0)
-    head_dim = 128
 
     # Generate random sequence lengths
     num_qo_heads = num_kv_heads * head_grp_size
@@ -639,6 +639,7 @@ def _test_trtllm_batch_prefill(
 @pytest.mark.parametrize("enable_sink", [True, False])
 @pytest.mark.parametrize("max_q_len", [511])
 @pytest.mark.parametrize("max_kv_len", [2047])
+@pytest.mark.parametrize("head_dim", [128, 256])
 def test_trtllm_batch_prefill(
     kv_layout,
     batch_size,
@@ -653,6 +654,7 @@ def test_trtllm_batch_prefill(
     enable_sink,
     max_q_len,
     max_kv_len,
+    head_dim,
 ):
     _test_trtllm_batch_prefill(
         kv_layout,
@@ -669,6 +671,7 @@ def test_trtllm_batch_prefill(
         max_q_len,
         max_kv_len,
         kv_dtype == "fp8",
+        head_dim,
     )
 
 
@@ -690,6 +693,7 @@ def test_trtllm_batch_prefill(
 @pytest.mark.parametrize("enable_sink", [False])
 @pytest.mark.parametrize("max_q_len", [8192])
 @pytest.mark.parametrize("max_kv_len", [8192])
+@pytest.mark.parametrize("head_dim", [128, 256])
 def test_trtllm_batch_prefill_bs1(
     kv_layout,
     batch_size,
@@ -704,6 +708,7 @@ def test_trtllm_batch_prefill_bs1(
     enable_sink,
     max_q_len,
     max_kv_len,
+    head_dim,
 ):
     _test_trtllm_batch_prefill(
         kv_layout,
@@ -720,6 +725,7 @@ def test_trtllm_batch_prefill_bs1(
         max_q_len,
         max_kv_len,
         False,
+        head_dim,
     )
 
 
@@ -1202,7 +1208,6 @@ def test_trtllm_batch_decode_head_dim_256(
     device_scale,
 ):
     # Small number of test cases for head_dim = 256
-    pytest.xfail("trtllm-gen decode gets incorrect output with head_dim = 256")
     _test_trtllm_batch_decode(
         "trtllm-gen",
         kv_layout,
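
In test_trtllm_gen_attention.py, head_dim is promoted from a hard-coded local (head_dim = 128) to a test parameter swept over [128, 256], and the xfail guarding the head_dim = 256 decode case is removed alongside the cubin update. A minimal, self-contained sketch of the parametrization pattern (the function names and the second parameter axis below are illustrative, not the real FlashInfer test bodies): stacking @pytest.mark.parametrize decorators makes pytest generate the cross-product of all values, so adding the head_dim axis doubles the number of generated cases, each receiving its head_dim through the test signature.

import pytest

def _check_attention(batch_size, head_dim):
    # Stand-in for the real _test_trtllm_batch_prefill helper: in the suite it
    # builds Q/K/V tensors with this head_dim and compares trtllm-gen output
    # against a reference implementation.
    assert head_dim in (128, 256)
    assert batch_size > 0

@pytest.mark.parametrize("batch_size", [4, 128])   # illustrative axis
@pytest.mark.parametrize("head_dim", [128, 256])   # new axis, as in the diff
def test_batch_prefill_sketch(batch_size, head_dim):
    # pytest expands the stacked decorators into the 2 x 2 cross-product:
    # (4, 128), (4, 256), (128, 128), (128, 256).
    _check_attention(batch_size, head_dim)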
2 changes: 1 addition & 1 deletion tests/comm/test_trtllm_mnnvl_allreduce_custom_comm.py
@@ -250,7 +250,7 @@ def test_mnnvl_allreduce_custom_communicator(
 
     available_gpus = torch.cuda.device_count()
     if world_size > available_gpus:
-        raise ValueError(
+        pytest.skip(
             f"world_size {world_size} is greater than available_gpus {available_gpus}"
         )
     print(f"Running test for world_size={world_size}")
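
The communicator test now skips instead of erroring when the host has fewer GPUs than the requested world_size: pytest.skip() raises pytest's internal skip exception, so the runner reports the case as SKIPPED rather than FAILED on smaller machines. A minimal sketch of the skip-guard pattern (the world_size values are illustrative):

import pytest
import torch

@pytest.mark.parametrize("world_size", [2, 4, 8])  # illustrative values
def test_allreduce_sketch(world_size):
    available_gpus = torch.cuda.device_count()
    if world_size > available_gpus:
        # Reported as SKIPPED, not FAILED, on machines with too few GPUs.
        pytest.skip(
            f"world_size {world_size} is greater than available_gpus {available_gpus}"
        )
    # ... the real test would construct the custom communicator and run the
    # MNNVL all-reduce here ...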