
Commit 2706062

feat: experimental support of green ctx (#1163)
## 📌 Description

Use cuda-python bindings to create green contexts for splitting SM resources.

Co-authored-by: Yi Pan <conlesspan@outlook.com>

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

cc @Conless
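For a quick sense of the new API, here is a minimal usage sketch based on the docstring in `flashinfer/green_ctx.py` below (the device and partition sizes are illustrative):

```python
import torch
from flashinfer.green_ctx import split_device_green_ctx

dev = torch.device("cuda:0")

# Split into 2 groups of at least 16 SMs each; the returned lists have
# num_groups + 1 entries, the last one covering the remaining SMs.
streams, resources = split_device_green_ctx(dev, num_groups=2, min_count=16)
print([r.sm.smCount for r in resources])  # e.g. [16, 16, 100] on a 132-SM GPU

with torch.cuda.stream(streams[0]):
    # Kernels launched here run only on the first group's SMs.
    x = torch.randn(4096, 4096, device=dev, dtype=torch.bfloat16)
    y = x @ x
```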
1 parent ba2470c commit 2706062

File tree

docs/api/green_ctx.rst
docs/index.rst
flashinfer/green_ctx.py
setup.py
tests/test_green_ctx.py

5 files changed: +209 -1 lines changed


docs/api/green_ctx.rst

Lines changed: 8 additions & 0 deletions
.. _apigreenctx:

flashinfer.green_ctx
====================

.. currentmodule:: flashinfer.green_ctx

.. autofunction:: split_device_green_ctx

docs/index.rst

Lines changed: 1 addition & 0 deletions
@@ -40,3 +40,4 @@ FlashInfer is a library and kernel generator for Large Language Models that prov
    api/rope
    api/activation
    api/quantization
+   api/green_ctx

flashinfer/green_ctx.py

Lines changed: 154 additions & 0 deletions
"""
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import List, Tuple

import cuda.bindings.driver as driver
import cuda.bindings.runtime as runtime
import cuda.cudart as cudart
import cuda.nvrtc as nvrtc
import torch
from cuda.bindings.driver import CUdevice, CUdevResource


def _cudaGetErrorEnum(error):
    if isinstance(error, driver.CUresult):
        err, name = driver.cuGetErrorName(error)
        return name if err == driver.CUresult.CUDA_SUCCESS else "<unknown>"
    elif isinstance(error, runtime.cudaError_t):
        return cudart.cudaGetErrorName(error)[1]
    elif isinstance(error, nvrtc.nvrtcResult):
        return nvrtc.nvrtcGetErrorString(error)[1]
    else:
        raise RuntimeError(f"Unknown error type: {error}")


def checkCudaErrors(result):
    if result[0].value:
        raise RuntimeError(
            f"CUDA error code={result[0].value}({_cudaGetErrorEnum(result[0])})"
        )
    if len(result) == 1:
        return None
    elif len(result) == 2:
        return result[1]
    else:
        return result[1:]


def get_cudevice(dev: torch.device) -> CUdevice:
    try:
        cu_dev = checkCudaErrors(driver.cuDeviceGet(dev.index))
    except RuntimeError:
        runtime.cudaInitDevice(dev.index, 0, 0)
        cu_dev = checkCudaErrors(driver.cuDeviceGet(dev.index))
    return cu_dev


def get_device_resource(cu_dev: CUdevice) -> CUdevResource:
    return checkCudaErrors(
        driver.cuDeviceGetDevResource(
            cu_dev, driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM
        )
    )


def split_resource(
    resource: CUdevResource,
    num_groups: int,
    min_count: int,
) -> Tuple[List[CUdevResource], CUdevResource]:
    results, _, remaining = checkCudaErrors(
        driver.cuDevSmResourceSplitByCount(
            num_groups,
            resource,
            0,  # useFlags
            min_count,
        )
    )
    return results, remaining


def create_green_ctx_streams(
    cu_dev: CUdevice, resources: List[CUdevResource]
) -> List[torch.Stream]:
    streams = []
    for split in resources:
        desc = checkCudaErrors(driver.cuDevResourceGenerateDesc([split], 1))
        green_ctx = checkCudaErrors(
            driver.cuGreenCtxCreate(
                desc, cu_dev, driver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM
            )
        )
        stream = checkCudaErrors(
            driver.cuGreenCtxStreamCreate(
                green_ctx,
                driver.CUstream_flags.CU_STREAM_NON_BLOCKING,
                0,  # priority
            )
        )
        streams.append(torch.cuda.get_stream_from_external(stream))

    return streams


def split_device_green_ctx(
    dev: torch.device, num_groups: int, min_count: int
) -> Tuple[List[torch.Stream], List[CUdevResource]]:
    r"""
    Split the device into multiple `green contexts <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html>`_,
    and return the corresponding streams and ``CUdevResource`` for each group plus the remaining SMs.
    Green contexts allow concurrent execution of multiple kernels on different SM partitions.

    Args:
        dev: The device to split.
        num_groups: The number of groups to split the device into.
        min_count: Minimum number of SMs required for each group; it will be adjusted to meet the
            alignment and granularity requirements.

    Returns:
        streams: The list of torch.Stream objects corresponding to the green contexts.
        resources: The list of CUdevResource objects corresponding to the green contexts.

    Example:
        >>> from flashinfer.green_ctx import split_device_green_ctx
        >>> import torch
        >>> dev = torch.device("cuda:0")
        >>> streams, resources = split_device_green_ctx(dev, 2, 16)
        >>> print([r.sm.smCount for r in resources])
        [16, 16, 100]
        >>> with torch.cuda.stream(streams[0]):
        ...     x = torch.randn(8192, 8192, device=dev, dtype=torch.bfloat16)
        ...     y = torch.randn(8192, 8192, device=dev, dtype=torch.bfloat16)
        ...     z = x @ y
        ...     print(z.shape)
        ...
        torch.Size([8192, 8192])

    Note:
        The length of the returned streams and resources is ``num_groups + 1``,
        where the last one contains the remaining SMs.

    Raises:
        RuntimeError: when the requested SM allocation exceeds the device capacity:
            ``num_groups * round_up(min_count, 8) > num_sm``
    """
    cu_dev = get_cudevice(dev)
    resource = get_device_resource(cu_dev)
    results, remaining = split_resource(resource, num_groups, min_count)
    resources = results + [remaining]
    streams = create_green_ctx_streams(cu_dev, resources)
    return streams, resources
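The docstring example exercises only a single partition. Since the point of green contexts is concurrent execution on disjoint SM sets, here is a hedged sketch of running one kernel per partition (sizes and synchronization pattern are illustrative, following standard torch.cuda stream usage):

```python
import torch
from flashinfer.green_ctx import split_device_green_ctx

dev = torch.device("cuda:0")
streams, resources = split_device_green_ctx(dev, num_groups=2, min_count=32)

# Prepare inputs on the default stream and make sure they are ready
# before the partition streams start consuming them.
a = torch.randn(4096, 4096, device=dev, dtype=torch.bfloat16)
b = torch.randn(4096, 4096, device=dev, dtype=torch.bfloat16)
torch.cuda.synchronize(dev)

# One matmul per SM partition; the kernels can overlap because each
# green context owns a disjoint set of SMs.
with torch.cuda.stream(streams[0]):
    c0 = a @ b
with torch.cuda.stream(streams[1]):
    c1 = b @ a

# Wait for both partitions before reading the results.
streams[0].synchronize()
streams[1].synchronize()
print(c0.shape, c1.shape)
```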

setup.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def generate_build_meta(aot_build_meta: dict) -> None:

 ext_modules = []
 cmdclass = {}
-install_requires = ["numpy", "torch", "ninja", "requests"]
+install_requires = ["numpy", "torch", "ninja", "requests", "cuda-python"]
 generate_build_meta({})

 if enable_aot:

tests/test_green_ctx.py

Lines changed: 45 additions & 0 deletions
import pytest
import torch

import flashinfer.green_ctx as green_ctx


@pytest.mark.parametrize("device", ["cuda:0"])
@pytest.mark.parametrize("num_groups", [1, 2, 3])
@pytest.mark.parametrize("min_count", [16, 32])
def test_green_ctx_creation(
    device: str,
    num_groups: int,
    min_count: int,
):
    streams, resources = green_ctx.split_device_green_ctx(
        torch.device(device), num_groups, min_count
    )

    assert len(resources) == num_groups + 1
    for resource in resources[:-1]:
        sm_count = resource.sm.smCount
        assert sm_count >= min_count


@pytest.mark.parametrize("device", ["cuda:0"])
@pytest.mark.parametrize("num_groups", [1, 2, 3])
@pytest.mark.parametrize("min_count", [16, 32])
def test_green_ctx_kernel_execution(
    device: str,
    num_groups: int,
    min_count: int,
):
    streams, resources = green_ctx.split_device_green_ctx(
        torch.device(device), num_groups, min_count
    )
    num_partitions = num_groups + 1
    assert len(streams) == num_partitions
    assert len(resources) == num_partitions

    for stream in streams:
        with torch.cuda.stream(stream):
            x = torch.randn(8192, 8192, device=device, dtype=torch.bfloat16)
            y = torch.randn(8192, 8192, device=device, dtype=torch.bfloat16)
            z = x @ y
            print(z.shape)
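One further assertion worth considering, sketched below and not part of this diff; it assumes the per-partition `smCount` fields and torch's `multi_processor_count` describe the same SM pool:

```python
def test_green_ctx_sm_accounting():
    # Hypothetical extra check: the group partitions plus the remainder
    # should not exceed the device's total SM count.
    dev = torch.device("cuda:0")
    _, resources = green_ctx.split_device_green_ctx(dev, num_groups=2, min_count=16)
    total_sms = torch.cuda.get_device_properties(dev).multi_processor_count
    assert sum(r.sm.smCount for r in resources) <= total_sms
```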
