22 commits
fdd9cbd
cherry pick 09133e9833811778240b3c2cc4de2390fd08e470; and only add AI…
vllmellm Feb 26, 2025
668ec2f
cherry pick acc27ffa94e677b8f6fce0f5b593430ce6acbfe4; and only add AI…
vllmellm Mar 5, 2025
8d49d6e
bug fixes and pass unit tests
tjtanaa Mar 17, 2025
43af6c0
add AITER setup steps in Dockerfile.rocm_base
tjtanaa Mar 17, 2025
0c30ce9
remove AITER setup steps in Dockerfile.rocm
tjtanaa Mar 17, 2025
ab73f97
Merge remote-tracking branch 'origin/main' into aiter-linear
tjtanaa Mar 17, 2025
e952b2d
fix missing property from Platform
tjtanaa Mar 17, 2025
6a632ac
skip AITER in AMD CI
tjtanaa Mar 17, 2025
61c92a9
Merge remote-tracking branch 'origin/main' into aiter-linear
tjtanaa Mar 20, 2025
0224eff
merge with main
tjtanaa Apr 16, 2025
d2ed934
revert run-amd-test.sh; update Dockerfile.rocm_base aiter version, re…
tjtanaa Apr 16, 2025
3fec588
clean up spaces and newline; fix typo
tjtanaa Apr 16, 2025
3558099
clean up spaces and newline;
tjtanaa Apr 16, 2025
2bf7206
fix typo
tjtanaa Apr 16, 2025
1f979fa
untested refactoring
tjtanaa Apr 17, 2025
f13746c
fix bug; validated to work V1 AITER unquantized and quantized
tjtanaa Apr 19, 2025
20139af
relocate the linear helper function into aiter_ops and fix unittest
tjtanaa Apr 19, 2025
700ac73
add test_aiter_ops.py to unit test if the ops are registered correctl…
tjtanaa Apr 19, 2025
7dd2812
fix the test to test fake tensor implementation
tjtanaa Apr 20, 2025
d9f0e7b
use current_platform.fp8_dtype(); update aiter commit
tjtanaa Apr 21, 2025
dde9157
merge with main; fix dispatcher and unit tests
tjtanaa Apr 22, 2025
e34712c
remove is_rocm_aiter_xxxx_enabled flag from _aiter_ops.py
tjtanaa Apr 22, 2025
22 changes: 22 additions & 0 deletions tests/model_executor/test_enabled_custom_ops.py
@@ -14,6 +14,7 @@
from vllm.model_executor.layers.layernorm import (
RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
from vllm.platforms import current_platform


@@ -96,6 +97,27 @@ def test_enabled_ops_invalid(env: str):
RMSNorm(1024).enabled()


@pytest.mark.skipif(not current_platform.is_rocm(),
reason="AITER is a feature exclusive for ROCm")
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
@pytest.mark.parametrize("use_rocm_aiter_linear", ["0", "1"])
def test_unquantized_linear_dispatch(use_rocm_aiter: str,
use_rocm_aiter_linear: str, monkeypatch):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR", use_rocm_aiter_linear)

linear_func = dispatch_unquantized_gemm()
print(f"use_rocm_aiter: {use_rocm_aiter}, " +
f"use_rocm_aiter_linear: {use_rocm_aiter_linear}")
if current_platform.is_rocm() and int(use_rocm_aiter) and int(
use_rocm_aiter_linear):
from vllm._aiter_ops import aiter_ops
assert linear_func == aiter_ops.rocm_aiter_tuned_gemm
else:
from vllm.model_executor.layers.utils import rocm_unquantized_gemm
assert linear_func == rocm_unquantized_gemm


@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
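For context, a minimal sketch of the dispatch logic this test exercises is shown below. The real dispatch_unquantized_gemm lives in vllm/model_executor/layers/utils.py and is not part of this diff, so details may differ; the sketch (with a hypothetical name) only mirrors what the ROCm-only test above asserts.

import vllm.envs as envs
from vllm.platforms import current_platform


def dispatch_unquantized_gemm_sketch():
    # Mirrors the assertions in test_unquantized_linear_dispatch above.
    if (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER
            and envs.VLLM_ROCM_USE_AITER_LINEAR):
        from vllm._aiter_ops import aiter_ops
        return aiter_ops.rocm_aiter_tuned_gemm
    from vllm.model_executor.layers.utils import rocm_unquantized_gemm
    return rocm_unquantized_gemm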
1 change: 0 additions & 1 deletion tests/quantization/test_fp8.py
@@ -27,7 +27,6 @@
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
use_rocm_aiter: bool, monkeypatch) -> None:

if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

145 changes: 145 additions & 0 deletions tests/v1/rocm/test_aiter_ops.py
@@ -0,0 +1,145 @@
# SPDX-License-Identifier: Apache-2.0
# This file tests the aiter ops. It checks that the aiter ops are:
# 1. correctly registered as custom ops
# 2. correctly linked to their fake implementations
# 3. usable with torch.compile
# The whole file is skipped if aiter is not installed
# or the platform is not ROCm.
#
# NOTE:
# These unit tests are not meant to verify the numerical
# correctness of the aiter ops. They only check that the
# aiter ops are registered correctly and that torch.compile
# can be used with them.
# The correctness of the aiter ops is tested in
# https://github.com/ROCm/aiter

import importlib.util

import pytest
import torch

from vllm._aiter_ops import aiter_ops
from vllm.platforms import current_platform

# Check if aiter package is installed
aiter_available = importlib.util.find_spec("aiter") is not None

pytestmark = pytest.mark.skipif(
not (current_platform.is_rocm() and aiter_available),
reason="AITER ops are only available on ROCm with aiter package installed")


def test_rocm_aiter_tuned_gemm_custom_op_registration():
"""Test that the custom op is correctly registered."""
# Check if the op exists in torch.ops.vllm
assert hasattr(torch.ops.vllm, 'rocm_aiter_tuned_gemm')

# Check if the op is callable
assert callable(torch.ops.vllm.rocm_aiter_tuned_gemm)


def test_rocm_aiter_tuned_gemm_torch_compile_compatibility():
"""Test that the op can be used with torch.compile."""
# Create test tensors
input_tensor = torch.randn(64, 32, dtype=torch.float16, device='cuda')
weight_tensor = torch.randn(16, 32, dtype=torch.float16, device='cuda')

# Define a function that uses the op
def gemm_fn(x, w):
return aiter_ops.rocm_aiter_tuned_gemm(x, w)

# Verify the op's fake implementation
torch.library.opcheck(torch.ops.vllm.rocm_aiter_tuned_gemm,
(input_tensor, weight_tensor),
test_utils=("test_schema", "test_faketensor"))

# Compile the function with appropriate settings based on
# vllm/compilation/wrapper.py
compiled_fn = torch.compile(gemm_fn,
fullgraph=True,
backend="inductor",
mode="reduce-overhead",
dynamic=False)

# Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode)
result_original = gemm_fn(input_tensor, weight_tensor)
result_compiled = compiled_fn(input_tensor, weight_tensor)

# Verify results match
assert torch.allclose(result_original, result_compiled)


def test_rocm_aiter_tuned_gemm_torch_compile_fp8_compatibility():

input_tensor = torch.randn(64, 32, dtype=torch.float16, device='cuda')
weight_tensor = torch.randn(16, 32, dtype=torch.float16, device='cuda')

input_fp8 = input_tensor.to(current_platform.fp8_dtype())
weight_fp8 = weight_tensor.to(current_platform.fp8_dtype())

scale_a = torch.tensor(10.0, device='cuda')
scale_b = torch.tensor(0.5, device='cuda')

# Define a function that uses the op with FP8 and scales
def gemm_fp8_fn(x, w, scale_a, scale_b):
return aiter_ops.rocm_aiter_tuned_gemm(x,
w,
out_dtype=torch.float16,
scale_a=scale_a,
scale_b=scale_b)

# Verify the op's fake implementation with FP8 inputs
# Disable test_schema as fp8 datatype is not supported by
# torch.library.opcheck
# Related error:
# OpCheckError: opcheck(op, ...): test_schema failed with
# "mul_cuda" not implemented for 'Float8_e4m3fnuz'
torch.library.opcheck(torch.ops.vllm.rocm_aiter_tuned_gemm,
(input_fp8, weight_fp8),
kwargs={
"out_dtype": torch.float16,
"scale_a": scale_a,
"scale_b": scale_b
},
test_utils=("test_faketensor"))

# Compile the function with appropriate settings based on
# vllm/compilation/wrapper.py
compiled_fp8_fn = torch.compile(gemm_fp8_fn,
fullgraph=True,
backend="inductor",
mode="reduce-overhead",
dynamic=False)

# Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode)
result_original = gemm_fp8_fn(input_fp8, weight_fp8, scale_a, scale_b)
result_compiled = compiled_fp8_fn(input_fp8, weight_fp8, scale_a, scale_b)

# Verify results match and have correct properties
assert torch.allclose(result_original, result_compiled)
assert result_original.dtype == torch.float16
assert result_compiled.dtype == torch.float16
assert result_original.shape == (64, 16)
assert result_compiled.shape == (64, 16)

# Get unscaled result
unscaled_result = aiter_ops.rocm_aiter_tuned_gemm(
input_fp8.to(torch.float16),
weight_fp8.to(torch.float16),
out_dtype=torch.float16)

# Verify that scaling was applied correctly
# The scaled result should be approximately equal to the
# unscaled result multiplied by the scales
expected_scaled = unscaled_result * (scale_a * scale_b)
assert torch.allclose(result_original,
expected_scaled,
rtol=1e-2,
atol=1e-2)

# Verify that scaled and unscaled results are different
assert not torch.allclose(
result_original, unscaled_result, rtol=1e-2, atol=1e-2)
76 changes: 76 additions & 0 deletions vllm/_aiter_ops.py
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
from typing import Optional

import torch

from vllm.platforms import current_platform
from vllm.utils import direct_register_custom_op


def rocm_aiter_tuned_gemm_impl(
input: torch.Tensor,
weight: torch.Tensor,
bias: Optional[torch.Tensor] = None,
out_dtype: Optional[torch.dtype] = None,
scale_a: Optional[torch.Tensor] = None,
scale_b: Optional[torch.Tensor] = None) -> torch.Tensor:

# This AITER function can be used for
# - BF16 and FP16 matmul
# e.g. vllm/model_executor/layers/linear.py
# - per-tensor activations + per-tensor weights
# e.g. vllm/model_executor/layers/quantization/utils/w8a8_utils.py
from aiter.tuned_gemm import tgemm as aiter_tgemm

return aiter_tgemm.mm(input,
weight,
otype=out_dtype,
scale_a=scale_a,
scale_b=scale_b,
bias=bias)


def rocm_aiter_tuned_gemm_fake(
input: torch.Tensor,
weight: torch.Tensor,
bias: Optional[torch.Tensor] = None,
out_dtype: Optional[torch.dtype] = None,
scale_a: Optional[torch.Tensor] = None,
scale_b: Optional[torch.Tensor] = None) -> torch.Tensor:

m = input.shape[0]
n = weight.shape[0]
if out_dtype is None:
out_dtype = input.dtype
return torch.empty((m, n), dtype=out_dtype, device=input.device)


if current_platform.is_rocm():
direct_register_custom_op(
op_name="rocm_aiter_tuned_gemm",
op_func=rocm_aiter_tuned_gemm_impl,
mutates_args=[],
fake_impl=rocm_aiter_tuned_gemm_fake,
dispatch_key=current_platform.dispatch_key,
)


class aiter_ops:

@staticmethod
def rocm_aiter_tuned_gemm(
input: torch.Tensor, # [M, K]
weight: torch.Tensor, # [N, K]
bias: Optional[torch.Tensor] = None,
out_dtype: Optional[torch.dtype] = None,
scale_a: Optional[torch.Tensor] = None,
scale_b: Optional[torch.Tensor] = None) -> torch.Tensor:

return torch.ops.vllm.rocm_aiter_tuned_gemm(
input,
weight,
bias=bias,
out_dtype=out_dtype,
scale_a=scale_a,
scale_b=scale_b,
)
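The impl/fake pair registered above is what allows torch.compile to trace the op without executing the AITER kernel. Purely as an illustration of that pattern (not how vLLM's direct_register_custom_op is implemented), the same effect can be sketched with the public torch.library API; the "demo::" namespace and the plain matmul body below are made up for the example.

import torch


@torch.library.custom_op("demo::tuned_gemm", mutates_args=())
def demo_tuned_gemm(input: torch.Tensor,
                    weight: torch.Tensor) -> torch.Tensor:
    # Eager implementation: a plain matmul stands in for the AITER kernel.
    return input @ weight.t()


@demo_tuned_gemm.register_fake
def _(input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Shape/dtype-only implementation used during torch.compile tracing.
    return torch.empty((input.shape[0], weight.shape[0]),
                       dtype=input.dtype,
                       device=input.device)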
10 changes: 4 additions & 6 deletions vllm/envs.py
@@ -528,8 +528,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
"VLLM_USE_V1":
lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),

# Disable aiter ops unless specifically enabled.
# Acts as a parent switch to enable the rest of the other operations.
# use aiter ops unless specifically disabled
"VLLM_ROCM_USE_AITER":
lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in
("true", "1")),
@@ -541,11 +540,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
("true", "1")),

# use aiter linear op if aiter ops are enabled
# The following list of related ops
# - scaled_mm (per-tensor / rowwise)
"VLLM_ROCM_USE_AITER_LINEAR":
lambda: (os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in
("true", "1")),
lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in
("true", "1") and os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True"
).lower() in ("true", "1")),

# Whether to use aiter moe ops.
# By default is enabled.
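In effect, VLLM_ROCM_USE_AITER now acts as a parent switch for VLLM_ROCM_USE_AITER_LINEAR: the linear flag still defaults to on, but it only evaluates to True when the parent switch is explicitly enabled. A standalone illustration of the gating expressed by the lambda above (example values only):

import os

# Parent switch defaults to off; the linear flag defaults to on.
os.environ["VLLM_ROCM_USE_AITER"] = "1"

use_aiter_linear = (
    os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1")
    and os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in
    ("true", "1"))

print(use_aiter_linear)  # True: parent enabled, linear flag left at default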
6 changes: 5 additions & 1 deletion vllm/model_executor/layers/linear.py
@@ -181,6 +181,10 @@ def apply(self,
class UnquantizedLinearMethod(LinearMethodBase):
"""Linear method without quantization."""

def __init__(self):
super().__init__()
self._gemm_func = dispatch_unquantized_gemm()

def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: list[int], input_size: int,
Expand All @@ -199,7 +203,7 @@ def apply(self,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:

return dispatch_unquantized_gemm()(x, layer.weight, bias)
return self._gemm_func(x, layer.weight, bias)


class LinearBase(torch.nn.Module):