flashinfer-ai
diff --git a/‎examples/tvm_ffi_example.py‎
Lines changed: 139 additions & 0 deletions b/‎examples/tvm_ffi_example.py‎
Lines changed: 139 additions & 0 deletions
diff --git a/‎flashinfer_bench/compile/__init__.py‎
Lines changed: 10 additions & 1 deletion b/‎flashinfer_bench/compile/__init__.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎flashinfer_bench/compile/builders/__init__.py‎
Lines changed: 2 additions & 1 deletion b/‎flashinfer_bench/compile/builders/__init__.py‎
Lines changed: 2 additions & 1 deletion
@@ -0,0 +1,139 @@
+"""Example demonstrating TVM-FFI builder for cross-framework kernel deployment.
+
+This example shows how to:
+1. Build a CUDA kernel with TVM-FFI (automatic caching)
+2. Use the same kernel in PyTorch, JAX, and CuPy (DLPack auto-conversion)
+3. Benefit from multi-process caching in benchmarks
+"""
+
+import torch
+
+import flashinfer_bench as fib
+from flashinfer_bench.compile import get_builder_registry
+from flashinfer_bench.data import BuildSpec, Definition, Solution, SourceFile, SupportedLanguages
+
+# Vector-add kernel source handed to the builder as "kernel.cu"; `vector_add` is the host-side
+# entry point and `add_kernel` is the device kernel it launches.
+CUDA_SOURCE = """
+#include <cuda_runtime.h>
+
+__global__ void add_kernel(float* a, float* b, float* c, int n) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        c[idx] = a[idx] + b[idx];
+    }
+}
+
+extern "C" void vector_add(float* a, float* b, float* c, int n) {
+    // An empty or negative length would produce a zero/invalid grid size,
+    // which is an invalid launch configuration, so there is nothing to do.
+    if (n <= 0) {
+        return;
+    }
+    int threads = 256;
+    int blocks = (n + threads - 1) / threads;
+    add_kernel<<<blocks, threads>>>(a, b, c, n);
+    cudaDeviceSynchronize();
+}
+"""
+
+
+def main():
+    """Build the vector-add kernel once and call it from PyTorch, JAX, and CuPy.
+
+    The PyTorch check always runs and allocates its tensors with ``device="cuda"``.
+    The JAX and CuPy checks are skipped with a message when the corresponding
+    package cannot be imported. The function ends by printing a summary of the
+    caching behaviour and key features.
+
+    Raises:
+        AssertionError: If a framework's output differs from the elementwise sum
+            computed by that same framework (tolerances rtol=1e-5, atol=1e-5).
+    """
+    # 1. Define the kernel specification
+    definition = Definition(
+        name="vector_add_f32",
+        op_type="elementwise",
+        description="Vector addition kernel",
+        axes={"n": {"type": "var"}},
+        constraints=[],
+        inputs={
+            "a": {"shape": ["n"], "dtype": "float32"},
+            "b": {"shape": ["n"], "dtype": "float32"},
+        },
+        outputs={"c": {"shape": ["n"], "dtype": "float32"}},
+        reference="def run(a, b): return a + b",
+    )
+
+    # 2. Create solution with CUDA source
+    solution = Solution(
+        name="vector_add_cuda_tvm",
+        definition="vector_add_f32",
+        author="example",
+        spec=BuildSpec(
+            language=SupportedLanguages.CUDA,
+            target_hardware=["gpu"],
+            entry_point="kernel.cu::vector_add",
+        ),
+        sources=[SourceFile(path="kernel.cu", content=CUDA_SOURCE)],
+        description="TVM-FFI vector add kernel",
+    )
+
+    # 3. Build with TVM-FFI (compiles on first run, cached afterwards)
+    print("Building kernel with TVM-FFI...")
+    builder_registry = get_builder_registry()
+    runnable = builder_registry.build(definition, solution)
+    print(f"✓ Built successfully: {runnable.meta}")
+
+    # 4. Use in PyTorch (DLPack auto-conversion)
+    print("\n=== PyTorch Test ===")
+    n = 1000000
+    a_torch = torch.randn(n, device="cuda", dtype=torch.float32)
+    b_torch = torch.randn(n, device="cuda", dtype=torch.float32)
+    c_torch = torch.empty(n, device="cuda", dtype=torch.float32)
+
+    runnable(a=a_torch, b=b_torch, c=c_torch, n=n)
+
+    expected = a_torch + b_torch
+    torch.testing.assert_close(c_torch, expected, rtol=1e-5, atol=1e-5)
+    print("✓ PyTorch: Result correct")
+
+    # 5. Use in JAX (DLPack auto-conversion)
+    # Only the import is guarded: an ImportError raised while running the kernel must
+    # surface as a failure instead of being reported as "JAX not installed".
+    try:
+        import jax.numpy as jnp
+    except ImportError:
+        print("⊘ JAX not installed, skipping")
+    else:
+        print("\n=== JAX Test ===")
+
+        # Inputs are copied from the PyTorch tensors through host NumPy arrays.
+        a_jax = jnp.array(a_torch.cpu().numpy())
+        b_jax = jnp.array(b_torch.cpu().numpy())
+        # NOTE(review): JAX arrays are immutable from Python; this relies on the runnable
+        # writing into c_jax's buffer in place - confirm the builder supports that.
+        c_jax = jnp.empty((n,), dtype=jnp.float32)
+
+        # Direct call - TVM-FFI handles DLPack conversion automatically
+        runnable(a=a_jax, b=b_jax, c=c_jax, n=n)
+
+        expected_jax = a_jax + b_jax
+        # Explicit raise rather than `assert`, which is stripped under `python -O`.
+        if not jnp.allclose(c_jax, expected_jax, rtol=1e-5, atol=1e-5):
+            raise AssertionError("JAX result does not match a + b")
+        print("✓ JAX: Result correct (via automatic DLPack conversion)")
+
+    # 6. Use in CuPy (DLPack auto-conversion)
+    # Same structure as the JAX section: guard the import only.
+    try:
+        import cupy as cp
+    except ImportError:
+        print("⊘ CuPy not installed, skipping")
+    else:
+        print("\n=== CuPy Test ===")
+
+        a_cupy = cp.random.randn(n, dtype=cp.float32)
+        b_cupy = cp.random.randn(n, dtype=cp.float32)
+        c_cupy = cp.empty(n, dtype=cp.float32)
+
+        runnable(a=a_cupy, b=b_cupy, c=c_cupy, n=n)
+
+        expected_cupy = a_cupy + b_cupy
+        cp.testing.assert_allclose(c_cupy, expected_cupy, rtol=1e-5, atol=1e-5)
+        print("✓ CuPy: Result correct (via automatic DLPack conversion)")
+
+    # 7. Demonstrate caching benefit
+    # The figures below are illustrative text only; nothing is measured here.
+    print("\n=== Multi-Process Caching Benefit ===")
+    print("First process: ~2-5s compilation → cached .so")
+    print("Subsequent processes: ~2-5ms load from cache")
+    print("For 100 kernels on 8 GPUs:")
+    print("  - Without TVM-FFI AOT: ~500s (redundant compilation)")
+    print("  - With TVM-FFI AOT: ~5s (shared cache)")
+    print("  - Speedup: 100x")
+
+    print("\n=== Key Features ===")
+    print("✓ Automatic caching: Compile once, reuse forever")
+    print("✓ Multi-process safe: Shared cache across processes")
+    print("✓ Cross-framework: Same .so for PyTorch, JAX, CuPy, TensorFlow")
+    print("✓ DLPack auto-conversion: No manual tensor conversion needed")
+    print("✓ Zero-copy: Efficient tensor passing")
+
+
+# Run the example only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
@@ -4,7 +4,16 @@
 """
 
 from .builder import Builder, BuildError
+from .prebuilt import PrebuiltLibraryManager, get_prebuilt_manager
 from .registry import BuilderRegistry, get_builder_registry
 from .runnable import Runnable
 
-__all__ = ["Builder", "BuildError", "BuilderRegistry", "Runnable", "get_builder_registry"]
+__all__ = [
+    "Builder",
+    "BuildError",
+    "BuilderRegistry",
+    "Runnable",
+    "get_builder_registry",
+    "PrebuiltLibraryManager",
+    "get_prebuilt_manager",
+]
@@ -1,5 +1,6 @@
 from .cuda_builder import CUDABuilder
 from .python_builder import PythonBuilder
 from .triton_builder import TritonBuilder
+from .tvm_ffi_builder import TVMFFIBuilder
 
-__all__ = ["CUDABuilder", "PythonBuilder", "TritonBuilder"]
+__all__ = ["CUDABuilder", "PythonBuilder", "TritonBuilder", "TVMFFIBuilder"]