Commit 0e055fc

[FlashInfer] Update include path and interface (#18317)
This PR updates the include paths for FlashInfer JIT compilation and the plan function interface for attention prefill computation, to align with the recent interface change in flashinfer-ai/flashinfer#1661.

Parent: 70e9164

2 files changed: 13 additions, 6 deletions

python/tvm/relax/backend/cuda/flashinfer.py (9 additions, 4 deletions)

```diff
@@ -141,8 +141,8 @@ def get_object_file_path(src: Path) -> Path:
         )
         include_paths += [
             Path(tvm_home).resolve() / "include",
-            Path(tvm_home).resolve() / "ffi" / "include",
-            Path(tvm_home).resolve() / "ffi" / "3rdparty" / "dlpack" / "include",
+            Path(tvm_home).resolve() / "3rdparty" / "tvm-ffi" / "include",
+            Path(tvm_home).resolve() / "3rdparty" / "tvm-ffi" / "3rdparty" / "dlpack" / "include",
             Path(tvm_home).resolve() / "3rdparty" / "dmlc-core" / "include",
         ]
     else:
@@ -160,8 +160,13 @@ def get_object_file_path(src: Path) -> Path:
         # The package is installed from source.
         include_paths += [
             tvm_package_path.parent.parent / "include",
-            tvm_package_path.parent.parent / "ffi" / "include",
-            tvm_package_path.parent.parent / "ffi" / "3rdparty" / "dlpack" / "include",
+            tvm_package_path.parent.parent / "3rdparty" / "tvm-ffi" / "include",
+            tvm_package_path.parent.parent
+            / "3rdparty"
+            / "tvm-ffi"
+            / "3rdparty"
+            / "dlpack"
+            / "include",
             tvm_package_path.parent.parent / "3rdparty" / "dmlc-core" / "include",
         ]
     else:
```
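In both install layouts, these directories end up as -I flags on the JIT compile command for the FlashInfer kernels, so moving the TVM FFI headers from ffi/ to 3rdparty/tvm-ffi/ only requires touching this list. A minimal sketch of that pattern, assuming an nvcc on PATH and a hypothetical compile_object helper (this is not TVM's actual implementation):

```python
import subprocess
from pathlib import Path

def compile_object(src: Path, include_paths: list[Path]) -> Path:
    """Hypothetical helper: JIT-compile one CUDA source file to an object file."""
    obj = src.with_suffix(".o")
    cmd = ["nvcc", "-c", str(src), "-o", str(obj)]
    # Each include directory becomes an -I flag; a header tree that moves on
    # disk (ffi/include -> 3rdparty/tvm-ffi/include) only changes this list.
    for path in include_paths:
        cmd += ["-I", str(path)]
    subprocess.run(cmd, check=True)
    return obj
```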

src/runtime/vm/attn_backend.h (4 additions, 2 deletions)

```diff
@@ -176,7 +176,8 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc {
       plan_func_(float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer,
                  qo_indptr->as_tensor(), page_indptr->as_tensor(), IntTuple(std::move(kv_len)),
                  total_qo_len, batch_size, num_qo_heads, num_kv_heads, page_size,
-                 /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal, copy_stream)
+                 /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal,
+                 /*window_left=*/-1, copy_stream)
           .cast<IntTuple>();
   } else if (attn_kind == AttnKind::kMLA) {
     plan_info_vec =
@@ -280,7 +281,8 @@ class FlashInferRaggedPrefillFunc : public RaggedPrefillFunc {
       plan_func_(float_workspace_buffer, int_workspace_buffer, page_locked_int_workspace_buffer,
                  qo_indptr->as_tensor(), kv_indptr->as_tensor(), IntTuple(std::move(kv_len)),
                  total_qo_len, batch_size, num_qo_heads, num_kv_heads, /*page_size=*/1,
-                 /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal, copy_stream)
+                 /*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal,
+                 /*window_left=*/-1, copy_stream)
           .cast<IntTuple>();
   }
```
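The only change in both call sites is the new /*window_left=*/-1 argument inserted before copy_stream, matching the updated plan interface from flashinfer-ai/flashinfer#1661. In FlashInfer's sliding-window convention, a non-negative window_left lets query position i attend only to key positions j in [i - window_left, i], while -1 disables the window, so passing -1 here preserves the previous full-attention behavior. A plain-Python illustration of that masking rule (attends is a hypothetical helper, not FlashInfer code):

```python
def attends(i: int, j: int, window_left: int, causal: bool = True) -> bool:
    """Whether query position i may attend to key position j under a
    sliding window of size window_left (-1 = no window)."""
    if causal and j > i:  # never look ahead under a causal mask
        return False
    if window_left >= 0 and j < i - window_left:  # outside the window
        return False
    return True

# window_left=-1: the query sees its whole causal prefix.
assert attends(10, 0, window_left=-1)
# window_left=2: only positions 8..10 are visible from position 10.
assert not attends(10, 0, window_left=2)
assert attends(10, 8, window_left=2)
```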
