
Commit bc50f1a

ci: add guard for aot compilation (#1127)
## πŸ“Œ Description

Do not compile sm90a/sm100a kernels for CUDA versions lower than a certain threshold.

## πŸ” Related Issues

Check the failed jobs: https://github.com/flashinfer-ai/flashinfer/actions/runs/15504091163/job/43657444651

## Reviewer Notes

cc @abcdabcd987 @wenscarl
1 parent 5a15b43 commit bc50f1a
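
Background on the guard (illustration only, not part of this commit): `torch.version.cuda` is a string, and comparing version strings with `>=` is lexicographic, so it misorders double-digit minor versions. The `version_at_least` helper added in flashinfer/utils.py below compares versions numerically via `packaging` instead.

# Sketch of the pitfall; nothing here is from the diff itself.
from packaging import version as pkg_version

print("12.10" >= "12.3")  # False: string comparison stops at "1" < "3"
print(pkg_version.parse("12.10") >= pkg_version.parse("12.3"))  # True: numeric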

File tree

flashinfer/aot.py
flashinfer/utils.py

2 files changed: +49 -35 lines changed

flashinfer/aot.py

Lines changed: 41 additions & 33 deletions
@@ -31,6 +31,7 @@
 from .quantization import gen_quantization_module
 from .rope import gen_rope_module
 from .sampling import gen_sampling_module
+from .utils import version_at_least


 def gen_fa2(
@@ -482,8 +483,12 @@ def main():
     if "TORCH_CUDA_ARCH_LIST" not in os.environ:
         raise RuntimeError("Please explicitly set env var TORCH_CUDA_ARCH_LIST.")
     gencode_flags = _get_cuda_arch_flags()
-    has_sm90 = any("compute_90" in flag for flag in gencode_flags)
-    has_sm100 = any("compute_100" in flag for flag in gencode_flags)
+    has_sm90 = any("compute_90" in flag for flag in gencode_flags) and version_at_least(
+        torch.version.cuda, "12.3"
+    )
+    has_sm100 = any(
+        "compute_100" in flag for flag in gencode_flags
+    ) and version_at_least(torch.version.cuda, "12.8")

     # Update data dir
     jit_env.FLASHINFER_CSRC_DIR = project_root / "csrc"
@@ -528,38 +533,41 @@ def main():
                 jit_env.SPDLOG_INCLUDE_DIR,
                 jit_env.FLASHINFER_INCLUDE_DIR,
             ],
-        ),
-        gen_jit_spec(
-            "trtllm_utils",
-            [
-                jit_env.FLASHINFER_CSRC_DIR
-                / "nv_internal"
-                / "tensorrt_llm"
-                / "kernels"
-                / "delayStream.cu",
-            ],
-            extra_include_paths=[
-                jit_env.FLASHINFER_CSRC_DIR / "nv_internal",
-                jit_env.FLASHINFER_CSRC_DIR / "nv_internal" / "include",
-                jit_env.FLASHINFER_CSRC_DIR
-                / "nv_internal"
-                / "tensorrt_llm"
-                / "cutlass_extensions"
-                / "include",
-                jit_env.FLASHINFER_CSRC_DIR
-                / "nv_internal"
-                / "tensorrt_llm"
-                / "kernels"
-                / "internal_cutlass_kernels"
-                / "include",
-                jit_env.FLASHINFER_CSRC_DIR
-                / "nv_internal"
-                / "tensorrt_llm"
-                / "kernels"
-                / "internal_cutlass_kernels",
-            ],
-        ),
+        )
     ]
+    if has_sm90:
+        jit_specs.append(
+            gen_jit_spec(
+                "trtllm_utils",
+                [
+                    jit_env.FLASHINFER_CSRC_DIR
+                    / "nv_internal"
+                    / "tensorrt_llm"
+                    / "kernels"
+                    / "delayStream.cu",
+                ],
+                extra_include_paths=[
+                    jit_env.FLASHINFER_CSRC_DIR / "nv_internal",
+                    jit_env.FLASHINFER_CSRC_DIR / "nv_internal" / "include",
+                    jit_env.FLASHINFER_CSRC_DIR
+                    / "nv_internal"
+                    / "tensorrt_llm"
+                    / "cutlass_extensions"
+                    / "include",
+                    jit_env.FLASHINFER_CSRC_DIR
+                    / "nv_internal"
+                    / "tensorrt_llm"
+                    / "kernels"
+                    / "internal_cutlass_kernels"
+                    / "include",
+                    jit_env.FLASHINFER_CSRC_DIR
+                    / "nv_internal"
+                    / "tensorrt_llm"
+                    / "kernels"
+                    / "internal_cutlass_kernels",
+                ],
+            ),
+        )
     jit_specs += gen_all_modules(
         f16_dtype_,
         f8_dtype_,
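
To see the effect of the aot.py hunks above in isolation, here is a minimal sketch; `version_at_least` mirrors the helper from utils.py, while `cuda_version` and `gencode_flags` are hypothetical stand-ins for `torch.version.cuda` and `_get_cuda_arch_flags()`.

# Minimal sketch of the new guard; the values below are made up for illustration.
from packaging import version as pkg_version


def version_at_least(version: str, base_version: str) -> bool:
    return pkg_version.parse(version) >= pkg_version.parse(base_version)


cuda_version = "12.2"  # hypothetical stand-in for torch.version.cuda
gencode_flags = ["-gencode=arch=compute_90,code=sm_90"]  # hypothetical flag list

# The arch flag alone is no longer enough: the toolkit version must also clear
# the threshold (12.3 for sm90a, 12.8 for sm100a), or the specs are skipped.
has_sm90 = any("compute_90" in flag for flag in gencode_flags) and version_at_least(
    cuda_version, "12.3"
)
print(has_sm90)  # False: compute_90 is requested, but CUDA 12.2 < 12.3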

flashinfer/utils.py

Lines changed: 8 additions & 2 deletions
@@ -394,14 +394,20 @@ def determine_attention_backend(
     return "fa2"


+def version_at_least(version: str, base_version: str) -> bool:
+    from packaging import version as pkg_version
+
+    return pkg_version.parse(version) >= pkg_version.parse(base_version)
+
+
 def is_sm90a_supported(device: torch.device) -> bool:
     major, _ = get_compute_capability(device)
-    return major == 9 and torch.version.cuda >= "12.3"
+    return major == 9 and version_at_least(torch.version.cuda, "12.3")


 def is_sm100a_supported(device: torch.device) -> bool:
     major, _ = get_compute_capability(device)
-    return major == 10 and torch.version.cuda >= "12.8"
+    return major == 10 and version_at_least(torch.version.cuda, "12.8")


 def determine_mla_backend(device: torch.device) -> str:
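
A quick standalone check of the patched predicate's behavior; only `version_at_least` is taken from this commit, and `is_sm90a_supported` is re-stated here with the device lookup replaced by a plain argument so it runs without a GPU.

# Standalone sketch: the compute-capability lookup is replaced by an argument.
from packaging import version as pkg_version


def version_at_least(version: str, base_version: str) -> bool:
    return pkg_version.parse(version) >= pkg_version.parse(base_version)


def is_sm90a_supported(major: int, cuda_version: str) -> bool:
    # Mirrors the patched predicate: Hopper (SM 9.x) and CUDA >= 12.3.
    return major == 9 and version_at_least(cuda_version, "12.3")


print(is_sm90a_supported(9, "12.3"))   # True: threshold is inclusive
print(is_sm90a_supported(9, "12.2"))   # False: toolkit too old
print(is_sm90a_supported(9, "12.10"))  # True: the case plain string >= got wrong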
